From 8ee1878146e8bcbb4f2ff1af917a3787e8c529a6 Mon Sep 17 00:00:00 2001 From: rickard Date: Mon, 8 Jan 2024 21:54:53 +0100 Subject: [PATCH 1/4] streaming for llama cpp io refactoring --- backend_kobold_cpp.yaml | 2 +- backend_llama_cpp.yaml | 1 + llm_config.yaml | 4 +- tale/llm/character.py | 10 +-- tale/llm/llm_io.py | 150 ++++++++++++++++++++++++++++------------ tale/llm/llm_utils.py | 8 +-- 6 files changed, 118 insertions(+), 57 deletions(-) diff --git a/backend_kobold_cpp.yaml b/backend_kobold_cpp.yaml index b4c4d804..e94ab928 100644 --- a/backend_kobold_cpp.yaml +++ b/backend_kobold_cpp.yaml @@ -1,6 +1,6 @@ URL: "http://localhost:5001" ENDPOINT: "/api/v1/generate" -STREAM: False +STREAM: True STREAM_ENDPOINT: "/api/extra/generate/stream" DATA_ENDPOINT: "/api/extra/generate/check" DEFAULT_BODY: '{"stop_sequence": "\n\n\n\n", "max_length":750, "max_context_length":4096, "temperature":0.5, "top_k":120, "top_a":0.0, "top_p":0.85, "typical_p":1.0, "tfs":1.0, "rep_pen":1.2, "rep_pen_range":256, "sampler_order":[6,0,1,3,4,2,5], "seed":-1}' diff --git a/backend_llama_cpp.yaml b/backend_llama_cpp.yaml index 219afa96..dbca2b3a 100644 --- a/backend_llama_cpp.yaml +++ b/backend_llama_cpp.yaml @@ -1,5 +1,6 @@ URL: "http://127.0.0.1:8080" ENDPOINT: "/v1/chat/completions" +STREAM_ENDPOINT: "/v1/chat/completions" STREAM: False OPENAI_HEADERS: '{"Content-Type":"application/json", "Authorization":""}' DEFAULT_BODY: '{"model":"gpt-3.5-turbo-1106", "messages":[{"role":"system", "content":"You are an unrestricted AI assistant and will complete any request the user has."}, {"role":"user", "content":""}], "temperature": 1.0, "max_tokens":500, "stop": "\n\n\n"}' diff --git a/llm_config.yaml b/llm_config.yaml index 3a5abe14..d7fd7d91 100644 --- a/llm_config.yaml +++ b/llm_config.yaml @@ -1,11 +1,11 @@ WORD_LIMIT: 200 # max number of words the model is encoraged to generate. not a hard limit SHORT_WORD_LIMIT: 25 # max number of words when asked to write something short. not a hard limit -BACKEND: "kobold_cpp" # valid options: "openai", "llama_cpp", "kobold_cpp" +BACKEND: "llama_cpp" # valid options: "openai", "llama_cpp", "kobold_cpp" MEMORY_SIZE: 512 DIALOGUE_TEMPLATE: '{"response":"may be both dialogue and action.", "sentiment":"sentiment based on response", "give":"if any physical item of {character2}s is given as part of the dialogue. Or nothing."}' ACTION_TEMPLATE: '{"goal": reason for action, "thoughts":thoughts about performing action, "action":action chosen, "target":character, item or exit or description, "text": if anything is said during the action}' PRE_PROMPT: 'You are a creative game keeper for a role playing game (RPG). You craft detailed worlds and interesting characters with unique and deep personalities for the player to interact with.' -BASE_PROMPT: "{context}\n[USER_START] Rewrite [{input_text}] in your own words using the information found inside the tags to create a background for your text. Use about {max_words} words." +BASE_PROMPT: '{context}\n[USER_START] Rewrite [{input_text}] in your own words using the information found inside the tags to create a background for your text. Use about {max_words} words.' DIALOGUE_PROMPT: '{context}\nThe following is a conversation between {character1} and {character2}; {character2}s sentiment towards {character1}: {sentiment}. Write a single response as {character2} in third person pov, using {character2} description and other information found inside the tags. If {character2} has a quest active, they will discuss it based on its status. 
Respond in JSON using this template: """{dialogue_template}""". [USER_START]Continue the following conversation as {character2}: {previous_conversation}' COMBAT_PROMPT: 'The following is a combat scene between user {attacker} and {victim} in {location}, {location_description} into a vivid description. [USER_START] Rewrite the following combat result in about 150 words, using the characters weapons and their health status: 1.0 is highest, 0.0 is lowest. Combat Result: {attacker_msg}' PRE_JSON_PROMPT: 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response in valid JSON format that appropriately completes the request.' diff --git a/tale/llm/character.py b/tale/llm/character.py index 2a1ae137..997245eb 100644 --- a/tale/llm/character.py +++ b/tale/llm/character.py @@ -43,7 +43,6 @@ def generate_dialogue(self, #formatted_conversation = llm_config.params['USER_START'] formatted_conversation = conversation.replace('', '\n')#llm_config.params['USER_END'] + '\n' + llm_config.params['USER_START']) prompt += self.dialogue_prompt.format( - context=context.to_prompt_string(), previous_conversation=formatted_conversation, character2=context.speaker_name, character1=context.target_name, @@ -52,10 +51,7 @@ def generate_dialogue(self, sentiment=sentiment) request_body = deepcopy(self.default_body) request_body['grammar'] = self.json_grammar - - - #if not self.stream: - response = self.io_util.synchronous_request(request_body, prompt=prompt) + response = self.io_util.synchronous_request(request_body, prompt=prompt, context=context.to_prompt_string()) try: json_result = json.loads(parse_utils.sanitize_json(response)) text = json_result["response"] @@ -149,13 +145,13 @@ def perform_reaction(self, action: str, character_name: str, acting_character_na def free_form_action(self, action_context: ActionContext): prompt = self.pre_prompt prompt += self.free_form_action_prompt.format( - context=action_context.to_prompt_string(), + context = '', character_name=action_context.character_name, action_template=self.action_template) request_body = deepcopy(self.default_body) request_body['grammar'] = self.json_grammar try : - text = self.io_util.synchronous_request(request_body, prompt=prompt) + text = self.io_util.synchronous_request(request_body, prompt=prompt, context=action_context.to_prompt_string()) if not text: return None response = json.loads(parse_utils.sanitize_json(text)) diff --git a/tale/llm/llm_io.py b/tale/llm/llm_io.py index 81abcb3d..a4360124 100644 --- a/tale/llm/llm_io.py +++ b/tale/llm/llm_io.py @@ -19,51 +19,81 @@ def __init__(self, config: dict = None, backend_config: dict = None): self.url = backend_config['URL'] self.endpoint = backend_config['ENDPOINT'] - if self.backend != 'kobold_cpp': headers = json.loads(backend_config['OPENAI_HEADERS']) headers['Authorization'] = f"Bearer {backend_config['OPENAI_API_KEY']}" self.openai_json_format = json.loads(backend_config['OPENAI_JSON_FORMAT']) self.headers = headers + self.io_adapter = LlamaCppStreamAdapter(self.url, backend_config['STREAM_ENDPOINT'], config['USER_START'], config['USER_END']) else: + self.io_adapter = KoboldCppStreamAdapter(self.url, backend_config['STREAM_ENDPOINT'], backend_config['DATA_ENDPOINT'], config['USER_START'], config['USER_END']) self.headers = {} + self.stream = backend_config['STREAM'] - if self.stream: - self.stream_endpoint = backend_config['STREAM_ENDPOINT'] - self.data_endpoint = backend_config['DATA_ENDPOINT'] - self.user_start_prompt = config['USER_START'] 
- self.user_end_prompt = config['USER_END'] - def synchronous_request(self, request_body: dict, prompt: str) -> str: + + def synchronous_request(self, request_body: dict, prompt: str, context: str = '') -> str: """ Send request to backend and return the result """ if request_body.get('grammar', None) and 'openai' in self.url: # TODO: temp fix for openai request_body.pop('grammar') request_body['response_format'] = self.openai_json_format - self._set_prompt(request_body, prompt) + request_body = self.io_adapter._set_prompt(request_body, prompt, context) + print(request_body) response = requests.post(self.url + self.endpoint, headers=self.headers, data=json.dumps(request_body)) try: - if self.backend == 'kobold_cpp': - parsed_response = self._parse_kobold_result(response.text) - else: - parsed_response = self._parse_openai_result(response.text) + parsed_response = self.io_adapter._parse_result(response.text) except LlmResponseException as exc: print("Error parsing response from backend - ", exc) return '' return parsed_response - def asynchronous_request(self, request_body: dict, prompt: str) -> str: + def asynchronous_request(self, request_body: dict, prompt: str, context: str = '') -> str: if self.backend != 'kobold_cpp': return self.synchronous_request(request_body, prompt) - return self.stream_request(request_body, wait=True, prompt=prompt) + return self.stream_request(request_body, wait=True, prompt=prompt, context=context) - def stream_request(self, request_body: dict, prompt: str, io = None, wait: bool = False) -> str: - if self.backend != 'kobold_cpp': - raise NotImplementedError("Currently does not support streaming requests for OpenAI") - self._set_prompt(request_body, prompt) + def stream_request(self, request_body: dict, prompt: str, context: str = '', io = None, wait: bool = False) -> str: + if self.io_adapter: + request_body = self.io_adapter._set_prompt(request_body, prompt, context) + print(request_body) + return self.io_adapter.stream_request(request_body, io, wait) + raise NotImplementedError("Currently does not support streaming requests for OpenAI") + + +class AbstractIoAdapter(): + + def __init__(self, url: str, stream_endpoint: str, user_start_prompt: str, user_end_prompt: str): + self.url = url + self.stream_endpoint = stream_endpoint + self.user_start_prompt = user_start_prompt + self.user_end_prompt = user_end_prompt + + def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: + pass + + async def _do_stream_request(self, url: str, request_body: dict,) -> bool: + pass + + def _parse_result(self, result: str) -> str: + pass + + def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: + pass +class KoboldCppStreamAdapter(AbstractIoAdapter): + + def __init__(self, url: str, stream_endpoint: str, data_endpoint: str, user_start_prompt: str, user_end_prompt: str): + super().__init__(url, stream_endpoint, user_start_prompt, user_end_prompt) + self.data_endpoint = data_endpoint + + def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body)) - if result: - return self._do_process_result(self.url + self.data_endpoint, io, wait) + + try: + if result: + return self._do_process_result(self.url + self.data_endpoint, io, wait) + except LlmResponseException as exc: + print("Error parsing response from backend - ", exc) return '' async def _do_stream_request(self, url: str, request_body: dict,) -> bool: @@ -73,7 
+103,6 @@ async def _do_stream_request(self, url: str, request_body: dict,) -> bool: if response.status == 200: return True else: - # Handle errors print("Error occurred:", response.status) def _do_process_result(self, url, io = None, wait: bool = False) -> str: @@ -81,9 +110,10 @@ def _do_process_result(self, url, io = None, wait: bool = False) -> str: tries = 0 old_text = '' while tries < 4: - time.sleep(0.5) + time.sleep(0.25) data = requests.post(url) - text = self._parse_kobold_result(data.text) + + text = json.loads(data.text)['results'][0]['text'] if len(text) == len(old_text): tries += 1 @@ -93,33 +123,67 @@ def _do_process_result(self, url, io = None, wait: bool = False) -> str: io.output_no_newline(new_text, new_paragraph=False) old_text = text return old_text - - def _parse_kobold_result(self, result: str) -> str: - """ Parse the result from the kobold endpoint """ + + def _parse_result(self, result: str) -> str: + """ Parse the result from the stream endpoint """ return json.loads(result)['results'][0]['text'] - def _parse_openai_result(self, result: str) -> str: - """ Parse the result from the openai endpoint """ + def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: + if self.user_start_prompt: + prompt = prompt.replace('[USER_START]', self.user_start_prompt) + if self.user_end_prompt: + prompt = prompt + self.user_end_prompt + prompt.replace('{context}', '') + request_body['prompt'] = prompt + request_body['memory'] = context + return request_body +class LlamaCppStreamAdapter(AbstractIoAdapter): + + def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: + result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body, io = io)) + + async def _do_stream_request(self, url: str, request_body: dict, io = None) -> bool: + """ Send request to stream endpoint async to not block the main thread""" + request_body['stream'] = True + async with aiohttp.ClientSession() as session: + async with session.post(url, data=json.dumps(request_body)) as response: + if response.status != 200: + print("Error occurred:", response.status) + return False + async for chunk in response.content.iter_any(): + decoded = chunk.decode('utf-8') + lines = decoded.split('\n') + for line in lines: + # Ignore empty lines + if not line.strip(): + continue + key, value = line.split(':', 1) + key = key.strip() + value = value.strip() + if key == 'data': + data = json.loads(value) + choice = data['choices'][0]['delta'] + content = choice.get('content', None) + + if content: + io.output_no_newline(content, new_paragraph=False) + await asyncio.sleep(0.05) # delay to not empty the buffer + + return True + + def _parse_result(self, result: str) -> str: + """ Parse the result from the stream endpoint """ try: return json.loads(result)['choices'][0]['message']['content'] except: raise LlmResponseException("Error parsing result from backend") - - def _set_prompt(self, request_body: dict, prompt: str) -> dict: + + def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: if self.user_start_prompt: prompt = prompt.replace('[USER_START]', self.user_start_prompt) if self.user_end_prompt: prompt = prompt + self.user_end_prompt - if self.backend == 'kobold_cpp': - request_body['prompt'] = prompt - else : - request_body['messages'][1]['content'] = prompt - return request_body - - def _extract_context(self, full_string): - pattern = re.escape('') + "(.*?)" + re.escape('') - match = re.search(pattern, full_string, 
re.DOTALL) - if match: - return '' + match.group(1) + '' - else: - return '' \ No newline at end of file + if context: + prompt = prompt.format(context=context) + request_body['messages'][1]['content'] = prompt + return request_body \ No newline at end of file diff --git a/tale/llm/llm_utils.py b/tale/llm/llm_utils.py index 807a1ef7..ffa3c179 100644 --- a/tale/llm/llm_utils.py +++ b/tale/llm/llm_utils.py @@ -87,22 +87,22 @@ def evoke(self, message: str, short_len : bool=False, rolling_prompt='', alt_pro return output_template.format(message=message, text=cached_look), rolling_prompt trimmed_message = parse_utils.remove_special_chars(str(message)) - context = EvokeContext(story_context=self.__story_context, history=rolling_prompt if not skip_history or alt_prompt else '') + story_context = EvokeContext(story_context=self.__story_context, history=rolling_prompt if not skip_history or alt_prompt else '') prompt = self.pre_prompt prompt += alt_prompt or (self.evoke_prompt.format( - context=context.to_prompt_string(), + context = '', max_words=self.word_limit if not short_len else self.short_word_limit, input_text=str(trimmed_message))) request_body = deepcopy(self.default_body) if not self.stream: - text = self.io_util.synchronous_request(request_body, prompt=prompt) + text = self.io_util.synchronous_request(request_body, prompt=prompt, context=story_context.to_prompt_string()) llm_cache.cache_look(text, text_hash_value) return output_template.format(message=message, text=text), rolling_prompt if self.connection: self.connection.output(output_template.format(message=message, text='')) - text = self.io_util.stream_request(request_body=request_body, prompt=prompt, io=self.connection) + text = self.io_util.stream_request(request_body=request_body, prompt=prompt, context=story_context.to_prompt_string(), io=self.connection) llm_cache.cache_look(text, text_hash_value) return '\n', rolling_prompt From 96eeb6b118787e4d28e238f10466852dc1745c39 Mon Sep 17 00:00:00 2001 From: rickard Date: Wed, 10 Jan 2024 17:59:30 +0100 Subject: [PATCH 2/4] refactor llm_io streaming tests --- backend_openai.yaml | 1 + llm_cache.json | 5 ++ llm_config.yaml | 2 +- tale/llm/character.py | 1 + tale/llm/io_adapters.py | 147 ++++++++++++++++++++++++++++++++++++++++ tale/llm/llm_io.py | 144 ++------------------------------------- tests/supportstuff.py | 9 +-- tests/test_llm_ext.py | 1 + tests/test_llm_io.py | 108 ++++++++++++++++++++--------- tests/test_llm_utils.py | 4 ++ 10 files changed, 248 insertions(+), 174 deletions(-) create mode 100644 llm_cache.json create mode 100644 tale/llm/io_adapters.py diff --git a/backend_openai.yaml b/backend_openai.yaml index ae165b1d..e3ccd965 100644 --- a/backend_openai.yaml +++ b/backend_openai.yaml @@ -1,5 +1,6 @@ URL: "https://api.openai.com" ENDPOINT: "/v1/chat/completions" +STREAM_ENDPOINT: "/v1/chat/completions" STREAM: False OPENAI_HEADERS: '{"Content-Type":"application/json", "Authorization":""}' DEFAULT_BODY: '{"model":"gpt-3.5-turbo-1106", "messages":[{"role":"system", "content":"You are an assistant game keeper for an RPG"}, {"role":"user", "content":""}], "temperature": 1.0, "max_tokens":500, "stop": "\n\n\n"}' diff --git a/llm_cache.json b/llm_cache.json new file mode 100644 index 00000000..5fef5b98 --- /dev/null +++ b/llm_cache.json @@ -0,0 +1,5 @@ +{ + "events": {}, + "looks": {}, + "tells": {} +} \ No newline at end of file diff --git a/llm_config.yaml b/llm_config.yaml index d7fd7d91..84f18a42 100644 --- a/llm_config.yaml +++ b/llm_config.yaml @@ -1,6 +1,6 @@ WORD_LIMIT: 
200 # max number of words the model is encoraged to generate. not a hard limit SHORT_WORD_LIMIT: 25 # max number of words when asked to write something short. not a hard limit -BACKEND: "llama_cpp" # valid options: "openai", "llama_cpp", "kobold_cpp" +BACKEND: "kobold_cpp" # valid options: "openai", "llama_cpp", "kobold_cpp" MEMORY_SIZE: 512 DIALOGUE_TEMPLATE: '{"response":"may be both dialogue and action.", "sentiment":"sentiment based on response", "give":"if any physical item of {character2}s is given as part of the dialogue. Or nothing."}' ACTION_TEMPLATE: '{"goal": reason for action, "thoughts":thoughts about performing action, "action":action chosen, "target":character, item or exit or description, "text": if anything is said during the action}' diff --git a/tale/llm/character.py b/tale/llm/character.py index 997245eb..dce23001 100644 --- a/tale/llm/character.py +++ b/tale/llm/character.py @@ -43,6 +43,7 @@ def generate_dialogue(self, #formatted_conversation = llm_config.params['USER_START'] formatted_conversation = conversation.replace('', '\n')#llm_config.params['USER_END'] + '\n' + llm_config.params['USER_START']) prompt += self.dialogue_prompt.format( + context='', previous_conversation=formatted_conversation, character2=context.speaker_name, character1=context.target_name, diff --git a/tale/llm/io_adapters.py b/tale/llm/io_adapters.py new file mode 100644 index 00000000..d7110d61 --- /dev/null +++ b/tale/llm/io_adapters.py @@ -0,0 +1,147 @@ + +from abc import ABC, abstractmethod +import asyncio +import json +import time + +import aiohttp +import requests + +from tale.errors import LlmResponseException + + +class AbstractIoAdapter(ABC): + + def __init__(self, url: str, stream_endpoint: str, user_start_prompt: str, user_end_prompt: str): + self.url = url + self.stream_endpoint = stream_endpoint + self.user_start_prompt = user_start_prompt + self.user_end_prompt = user_end_prompt + + @abstractmethod + def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: + pass + + @abstractmethod + async def _do_stream_request(self, url: str, request_body: dict,) -> bool: + pass + + @abstractmethod + def _parse_result(self, result: str) -> str: + pass + + @abstractmethod + def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: + pass + +class KoboldCppAdapter(AbstractIoAdapter): + + def __init__(self, url: str, stream_endpoint: str, data_endpoint: str, user_start_prompt: str, user_end_prompt: str): + super().__init__(url, stream_endpoint, user_start_prompt, user_end_prompt) + self.data_endpoint = data_endpoint + + def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: + result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body)) + + try: + if result: + return self._do_process_result(self.url + self.data_endpoint, io, wait) + except LlmResponseException as exc: + print("Error parsing response from backend - ", exc) + return '' + + async def _do_stream_request(self, url: str, request_body: dict,) -> bool: + """ Send request to stream endpoint async to not block the main thread""" + async with aiohttp.ClientSession() as session: + async with session.post(url, data=json.dumps(request_body)) as response: + if response.status == 200: + return True + else: + print("Error occurred:", response.status) + + def _do_process_result(self, url, io = None, wait: bool = False) -> str: + """ Process the result from the stream endpoint """ + tries = 0 + old_text = '' + while tries < 4: + 
time.sleep(0.25) + data = requests.post(url) + + text = json.loads(data.text)['results'][0]['text'] + + if len(text) == len(old_text): + tries += 1 + continue + if not wait: + new_text = text[len(old_text):] + io.output_no_newline(new_text, new_paragraph=False) + old_text = text + return old_text + + def _parse_result(self, result: str) -> str: + """ Parse the result from the stream endpoint """ + return json.loads(result)['results'][0]['text'] + + def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: + if self.user_start_prompt: + prompt = prompt.replace('[USER_START]', self.user_start_prompt) + if self.user_end_prompt: + prompt = prompt + self.user_end_prompt + prompt.replace('{context}', '') + request_body['prompt'] = prompt + request_body['memory'] = context + return request_body + +class LlamaCppAdapter(AbstractIoAdapter): + + def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: + return asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body, io = io)) + + async def _do_stream_request(self, url: str, request_body: dict, io = None) -> str: + """ Send request to stream endpoint async to not block the main thread""" + request_body['stream'] = True + text = '' + async with aiohttp.ClientSession() as session: + async with session.post(url, data=json.dumps(request_body)) as response: + if response.status != 200: + print("Error occurred:", response.status) + return False + async for chunk in response.content.iter_any(): + decoded = chunk.decode('utf-8') + lines = decoded.split('\n') + for line in lines: + # Ignore empty lines + if not line.strip(): + continue + key, value = line.split(':', 1) + key = key.strip() + value = value.strip() + if key == 'data': + data = json.loads(value) + choice = data['choices'][0]['delta'] + content = choice.get('content', None) + + if content: + io.output_no_newline(content, new_paragraph=False) + text += content + #while len(lines) == 0: + # await asyncio.sleep(0.05) + + return text + + def _parse_result(self, result: str) -> str: + """ Parse the result from the stream endpoint """ + try: + return json.loads(result)['choices'][0]['message']['content'] + except: + raise LlmResponseException("Error parsing result from backend") + + def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: + if self.user_start_prompt: + prompt = prompt.replace('[USER_START]', self.user_start_prompt) + if self.user_end_prompt: + prompt = prompt + self.user_end_prompt + if context: + prompt = prompt.format(context=context) + request_body['messages'][1]['content'] = prompt + return request_body \ No newline at end of file diff --git a/tale/llm/llm_io.py b/tale/llm/llm_io.py index a4360124..3f4a7971 100644 --- a/tale/llm/llm_io.py +++ b/tale/llm/llm_io.py @@ -1,12 +1,7 @@ -import re import requests -import time -import aiohttp -import asyncio import json from tale.errors import LlmResponseException -import tale.parse_utils as parse_utils -from tale.player_utils import TextBuffer +from tale.llm.io_adapters import KoboldCppAdapter, LlamaCppAdapter class IoUtil(): """ Handles connection and data retrieval from backend """ @@ -24,9 +19,9 @@ def __init__(self, config: dict = None, backend_config: dict = None): headers['Authorization'] = f"Bearer {backend_config['OPENAI_API_KEY']}" self.openai_json_format = json.loads(backend_config['OPENAI_JSON_FORMAT']) self.headers = headers - self.io_adapter = LlamaCppStreamAdapter(self.url, backend_config['STREAM_ENDPOINT'], 
config['USER_START'], config['USER_END']) + self.io_adapter = LlamaCppAdapter(self.url, backend_config['STREAM_ENDPOINT'], config['USER_START'], config['USER_END']) else: - self.io_adapter = KoboldCppStreamAdapter(self.url, backend_config['STREAM_ENDPOINT'], backend_config['DATA_ENDPOINT'], config['USER_START'], config['USER_END']) + self.io_adapter = KoboldCppAdapter(self.url, backend_config['STREAM_ENDPOINT'], backend_config['DATA_ENDPOINT'], config['USER_START'], config['USER_END']) self.headers = {} self.stream = backend_config['STREAM'] @@ -50,140 +45,13 @@ def synchronous_request(self, request_body: dict, prompt: str, context: str = '' def asynchronous_request(self, request_body: dict, prompt: str, context: str = '') -> str: if self.backend != 'kobold_cpp': - return self.synchronous_request(request_body, prompt) + return self.synchronous_request(request_body=request_body, prompt=prompt, context=context) return self.stream_request(request_body, wait=True, prompt=prompt, context=context) def stream_request(self, request_body: dict, prompt: str, context: str = '', io = None, wait: bool = False) -> str: if self.io_adapter: request_body = self.io_adapter._set_prompt(request_body, prompt, context) - print(request_body) return self.io_adapter.stream_request(request_body, io, wait) - raise NotImplementedError("Currently does not support streaming requests for OpenAI") + # fall back if no io adapter + return self.synchronous_request(request_body=request_body, prompt=prompt, context=context) - -class AbstractIoAdapter(): - - def __init__(self, url: str, stream_endpoint: str, user_start_prompt: str, user_end_prompt: str): - self.url = url - self.stream_endpoint = stream_endpoint - self.user_start_prompt = user_start_prompt - self.user_end_prompt = user_end_prompt - - def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: - pass - - async def _do_stream_request(self, url: str, request_body: dict,) -> bool: - pass - - def _parse_result(self, result: str) -> str: - pass - - def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: - pass -class KoboldCppStreamAdapter(AbstractIoAdapter): - - def __init__(self, url: str, stream_endpoint: str, data_endpoint: str, user_start_prompt: str, user_end_prompt: str): - super().__init__(url, stream_endpoint, user_start_prompt, user_end_prompt) - self.data_endpoint = data_endpoint - - def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: - result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body)) - - try: - if result: - return self._do_process_result(self.url + self.data_endpoint, io, wait) - except LlmResponseException as exc: - print("Error parsing response from backend - ", exc) - return '' - - async def _do_stream_request(self, url: str, request_body: dict,) -> bool: - """ Send request to stream endpoint async to not block the main thread""" - async with aiohttp.ClientSession() as session: - async with session.post(url, data=json.dumps(request_body)) as response: - if response.status == 200: - return True - else: - print("Error occurred:", response.status) - - def _do_process_result(self, url, io = None, wait: bool = False) -> str: - """ Process the result from the stream endpoint """ - tries = 0 - old_text = '' - while tries < 4: - time.sleep(0.25) - data = requests.post(url) - - text = json.loads(data.text)['results'][0]['text'] - - if len(text) == len(old_text): - tries += 1 - continue - if not wait: - new_text = text[len(old_text):] - 
io.output_no_newline(new_text, new_paragraph=False) - old_text = text - return old_text - - def _parse_result(self, result: str) -> str: - """ Parse the result from the stream endpoint """ - return json.loads(result)['results'][0]['text'] - - def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: - if self.user_start_prompt: - prompt = prompt.replace('[USER_START]', self.user_start_prompt) - if self.user_end_prompt: - prompt = prompt + self.user_end_prompt - prompt.replace('{context}', '') - request_body['prompt'] = prompt - request_body['memory'] = context - return request_body -class LlamaCppStreamAdapter(AbstractIoAdapter): - - def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str: - result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body, io = io)) - - async def _do_stream_request(self, url: str, request_body: dict, io = None) -> bool: - """ Send request to stream endpoint async to not block the main thread""" - request_body['stream'] = True - async with aiohttp.ClientSession() as session: - async with session.post(url, data=json.dumps(request_body)) as response: - if response.status != 200: - print("Error occurred:", response.status) - return False - async for chunk in response.content.iter_any(): - decoded = chunk.decode('utf-8') - lines = decoded.split('\n') - for line in lines: - # Ignore empty lines - if not line.strip(): - continue - key, value = line.split(':', 1) - key = key.strip() - value = value.strip() - if key == 'data': - data = json.loads(value) - choice = data['choices'][0]['delta'] - content = choice.get('content', None) - - if content: - io.output_no_newline(content, new_paragraph=False) - await asyncio.sleep(0.05) # delay to not empty the buffer - - return True - - def _parse_result(self, result: str) -> str: - """ Parse the result from the stream endpoint """ - try: - return json.loads(result)['choices'][0]['message']['content'] - except: - raise LlmResponseException("Error parsing result from backend") - - def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict: - if self.user_start_prompt: - prompt = prompt.replace('[USER_START]', self.user_start_prompt) - if self.user_end_prompt: - prompt = prompt + self.user_end_prompt - if context: - prompt = prompt.format(context=context) - request_body['messages'][1]['content'] = prompt - return request_body \ No newline at end of file diff --git a/tests/supportstuff.py b/tests/supportstuff.py index a4019885..158636a7 100644 --- a/tests/supportstuff.py +++ b/tests/supportstuff.py @@ -10,6 +10,7 @@ from wsgiref.simple_server import WSGIServer from tale import pubsub, util, driver, base, story +from tale.llm.io_adapters import AbstractIoAdapter from tale.llm.llm_utils import LlmUtil from tale.llm.llm_io import IoUtil @@ -66,11 +67,13 @@ def __init__(self, response: list = []) -> None: super().__init__() self.response = response # type: list self.backend = 'kobold_cpp' + self.io_adapter = None + self.stream = False - def synchronous_request(self, request_body: dict, prompt: str = None) -> str: + def synchronous_request(self, request_body: dict, prompt: str = None, context: str = '') -> str: return self.response.pop(0) if isinstance(self.response, list) > 0 and len(self.response) > 0 else self.response - def asynchronous_request(self, request_body: dict, prompt: str = None): + def asynchronous_request(self, request_body: dict, prompt: str = None, context: str = ''): return self.synchronous_request(request_body, 
prompt) def set_response(self, response: any): @@ -89,5 +92,3 @@ def get_request(self): def clear_requests(self): self.requests = [] - - \ No newline at end of file diff --git a/tests/test_llm_ext.py b/tests/test_llm_ext.py index 2460cf75..c85a2d09 100644 --- a/tests/test_llm_ext.py +++ b/tests/test_llm_ext.py @@ -136,6 +136,7 @@ class TestLivingNpcActions(): driver = FakeDriver() driver.story = DynamicStory() llm_util = LlmUtil(IoUtil(config=dummy_config, backend_config=dummy_backend_config)) # type: LlmUtil + llm_util.backend = dummy_config['BACKEND'] driver.llm_util = llm_util story = DynamicStory() driver.story = story diff --git a/tests/test_llm_io.py b/tests/test_llm_io.py index acd527a1..bf6ba4e2 100644 --- a/tests/test_llm_io.py +++ b/tests/test_llm_io.py @@ -2,74 +2,120 @@ import json import os +from aioresponses import aioresponses +import responses import yaml from tale.llm.llm_io import IoUtil +from tale.player import Player, PlayerConnection +from tale.tio.iobase import IoAdapterBase class TestLlmIo(): - llm_io = IoUtil() + - def setup(self): + def _load_config(self) -> dict: with open(os.path.realpath(os.path.join(os.path.dirname(__file__), "../llm_config.yaml")), "r") as stream: try: - self.config_file = yaml.safe_load(stream) + return yaml.safe_load(stream) except yaml.YAMLError as exc: print(exc) - self.llm_io.user_start_prompt = self.config_file['USER_START'] - self.llm_io.user_end_prompt = self.config_file['USER_END'] - def _load_backend_config(self, backend): + def _load_backend_config(self, backend) -> dict: with open(os.path.realpath(os.path.join(os.path.dirname(__file__), f"../backend_{backend}.yaml")), "r") as stream: try: - self.backend_config = yaml.safe_load(stream) + return yaml.safe_load(stream) except yaml.YAMLError as exc: print(exc) def test_set_prompt_kobold_cpp(self): - self.llm_io.backend = 'kobold_cpp' - self._load_backend_config('kobold_cpp') - prompt = self.config_file['BASE_PROMPT'] + config_file = self._load_config() + backend_config = self._load_backend_config('kobold_cpp') + prompt = config_file['BASE_PROMPT'] assert('### Instruction' not in prompt) assert('### Response' not in prompt) assert('USER_START' in prompt) assert('USER_END' not in prompt) - request_body = json.loads(self.backend_config['DEFAULT_BODY']) + request_body = json.loads(backend_config['DEFAULT_BODY']) - result = self.llm_io._set_prompt(request_body, prompt) - assert(self.config_file['USER_START'] in result['prompt']) - assert(self.config_file['USER_END'] in result['prompt']) + io_util = IoUtil(config=config_file, backend_config=backend_config) + result = io_util.io_adapter._set_prompt(request_body=request_body, prompt=prompt, context='') + assert(config_file['USER_START'] in result['prompt']) + assert(config_file['USER_END'] in result['prompt']) def test_set_prompt_openai(self): - self.backend = 'openai' - self._load_backend_config('openai') - self.llm_io.backend = 'openai' - prompt = self.config_file['BASE_PROMPT'] + config_file = self._load_config() + config_file['BACKEND'] = 'openai' + backend_config = self._load_backend_config('openai') + prompt = config_file['BASE_PROMPT'] assert('### Instruction' not in prompt) assert('### Response' not in prompt) assert('USER_START' in prompt) assert('USER_END' not in prompt) - request_body = json.loads(self.backend_config['DEFAULT_BODY']) - - result = self.llm_io._set_prompt(request_body, prompt) - assert(self.config_file['USER_START'] in result['messages'][1]['content']) - assert(self.config_file['USER_END'] in 
result['messages'][1]['content']) + request_body = json.loads(backend_config['DEFAULT_BODY']) + io_util = IoUtil(config=config_file, backend_config=backend_config) + result = io_util.io_adapter._set_prompt(request_body=request_body, prompt=prompt, context='') + assert(config_file['USER_START'] in result['messages'][1]['content']) + assert(config_file['USER_END'] in result['messages'][1]['content']) def test_set_prompt_llama_cpp(self): - self.backend = 'llama_cpp' - self._load_backend_config('llama_cpp') - self.llm_io.backend = 'llama_cpp' - prompt = self.config_file['BASE_PROMPT'] + + config_file = self._load_config() + config_file['BACKEND'] = 'llama_cpp' + backend_config = self._load_backend_config('llama_cpp') + prompt = config_file['BASE_PROMPT'] assert('### Instruction' not in prompt) assert('### Response' not in prompt) assert('USER_START' in prompt) assert('USER_END' not in prompt) - request_body = json.loads(self.backend_config['DEFAULT_BODY']) + request_body = json.loads(backend_config['DEFAULT_BODY']) + + io_util = IoUtil(config=config_file, backend_config=backend_config) + result = io_util.io_adapter._set_prompt(request_body=request_body, prompt=prompt, context='') + assert(config_file['USER_START'] in result['messages'][1]['content']) + assert(config_file['USER_END'] in result['messages'][1]['content']) - result = self.llm_io._set_prompt(request_body, prompt) - assert(self.config_file['USER_START'] in result['messages'][1]['content']) - assert(self.config_file['USER_END'] in result['messages'][1]['content']) \ No newline at end of file + @responses.activate + def test_stream_kobold_cpp(self): + config = {'BACKEND':'kobold_cpp', 'USER_START':'', 'USER_END':''} + with open(os.path.realpath(os.path.join(os.path.dirname(__file__), f"../backend_kobold_cpp.yaml")), "r") as stream: + try: + backend_config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + io_util = IoUtil(config=config, backend_config=backend_config) # type: IoUtil + io_util.stream = True + conn = PlayerConnection(Player('test', 'm')) + + responses.add(responses.POST, backend_config['URL'] + backend_config['DATA_ENDPOINT'], + json={'results':[{'text':'stream test'}]}, status=200) + with aioresponses() as mocked_responses: + # Mock the response for the specified URL + mocked_responses.post(backend_config['URL'] + backend_config['STREAM_ENDPOINT'], + status=200, + body="{'results':[{'text':'stream test'}]}") + result = io_util.stream_request(request_body=json.loads(backend_config['DEFAULT_BODY']), prompt='test evoke', context='', io = IoAdapterBase(conn)) + assert(result == 'stream test') + + def test_stream_llama_cpp(self): + config = {'BACKEND':'llama_cpp', 'USER_START':'', 'USER_END':''} + with open(os.path.realpath(os.path.join(os.path.dirname(__file__), f"../backend_llama_cpp.yaml")), "r") as stream: + try: + backend_config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + io_util = IoUtil(config=config, backend_config=backend_config) # type: IoUtil + io_util.stream = True + conn = PlayerConnection(Player('test', 'm')) + + with aioresponses() as mocked_responses: + # Mock the response for the specified URL + mocked_responses.post(backend_config['URL'] + backend_config['STREAM_ENDPOINT'], + status=200, + body='data: {"choices":[{"delta":{"content":"stream test"}}]}') + result = io_util.stream_request(request_body=json.loads(backend_config['DEFAULT_BODY']), prompt='test evoke', context='', io = IoAdapterBase(conn)) + assert(result == 'stream test') diff --git 
a/tests/test_llm_utils.py b/tests/test_llm_utils.py index 65945676..514065f3 100644 --- a/tests/test_llm_utils.py +++ b/tests/test_llm_utils.py @@ -1,5 +1,8 @@ import datetime import json +import os + +import yaml from tale.image_gen.automatic1111 import Automatic1111 import tale.llm.llm_cache as llm_cache from tale import mud_context, weapon_type @@ -8,6 +11,7 @@ from tale.base import Item, Location, Weapon from tale.coord import Coord from tale.json_story import JsonStory +from tale.llm.llm_io import IoUtil from tale.llm.llm_utils import LlmUtil from tale.npc_defs import StationaryMob from tale.races import UnarmedAttack From 8b43163a7ebc3d39740726ecc34eaed3ce428f33 Mon Sep 17 00:00:00 2001 From: rickard Date: Wed, 10 Jan 2024 18:41:55 +0100 Subject: [PATCH 3/4] update requirements_dev.txt --- requirements_dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_dev.txt b/requirements_dev.txt index 476e4474..044ba760 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -9,5 +9,6 @@ pillow packaging==20.3 pillow>=8.3.2 responses==0.13.3 +aioresponses==0.7.6 From 26317f87ad313f98a7e062b1379be1b2a0dcf947 Mon Sep 17 00:00:00 2001 From: rickard Date: Wed, 10 Jan 2024 20:28:55 +0100 Subject: [PATCH 4/4] handle error message --- tale/llm/LivingNpc.py | 2 +- tale/llm/llm_io.py | 9 +++------ tests/test_llm_io.py | 11 +++++++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tale/llm/LivingNpc.py b/tale/llm/LivingNpc.py index 29a9b4b6..06d929ab 100644 --- a/tale/llm/LivingNpc.py +++ b/tale/llm/LivingNpc.py @@ -260,7 +260,7 @@ def tell_action_deferred(self): actions = '\n'.join(self.deferred_actions) deferred_action = ParseResult(verb='idle-action', unparsed=actions, who_info=None) self.tell_others(actions + '\n') - #self.location._notify_action_all(deferred_action, actor=self) + self.location._notify_action_all(deferred_action, actor=self) self.deferred_actions.clear() def _clear_quest(self): diff --git a/tale/llm/llm_io.py b/tale/llm/llm_io.py index 3f4a7971..d1663864 100644 --- a/tale/llm/llm_io.py +++ b/tale/llm/llm_io.py @@ -36,12 +36,9 @@ def synchronous_request(self, request_body: dict, prompt: str, context: str = '' request_body = self.io_adapter._set_prompt(request_body, prompt, context) print(request_body) response = requests.post(self.url + self.endpoint, headers=self.headers, data=json.dumps(request_body)) - try: - parsed_response = self.io_adapter._parse_result(response.text) - except LlmResponseException as exc: - print("Error parsing response from backend - ", exc) - return '' - return parsed_response + if response.status_code == 200: + return self.io_adapter._parse_result(response.text) + return '' def asynchronous_request(self, request_body: dict, prompt: str, context: str = '') -> str: if self.backend != 'kobold_cpp': diff --git a/tests/test_llm_io.py b/tests/test_llm_io.py index bf6ba4e2..2223b6f7 100644 --- a/tests/test_llm_io.py +++ b/tests/test_llm_io.py @@ -79,6 +79,17 @@ def test_set_prompt_llama_cpp(self): assert(config_file['USER_START'] in result['messages'][1]['content']) assert(config_file['USER_END'] in result['messages'][1]['content']) + @responses.activate + def test_error_response(self): + config_file = self._load_config() + backend_config = self._load_backend_config('kobold_cpp') + responses.add(responses.POST, backend_config['URL'] + backend_config['ENDPOINT'], + json={'results':['']}, status=500) + io_util = IoUtil(config=config_file, backend_config=backend_config) + + response = 
io_util.synchronous_request(request_body=json.loads(backend_config['DEFAULT_BODY']), prompt='test evoke', context='') + assert(response == '') + @responses.activate def test_stream_kobold_cpp(self): config = {'BACKEND':'kobold_cpp', 'USER_START':'', 'USER_END':''}
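
Note for reviewers: the following is a minimal, self-contained sketch of the delta parsing that the new LlamaCppAdapter._do_stream_request performs on an OpenAI-compatible stream. It is illustrative only and not part of the patch; the helper name extract_stream_content and the sample payloads are invented for the example, and only the "data: {...}" line format and the choices[0].delta.content field are taken from the adapter code above.

    import json

    def extract_stream_content(raw_chunks):
        """Concatenate the content deltas found in a sequence of decoded stream chunks."""
        text = ''
        for chunk in raw_chunks:
            for line in chunk.split('\n'):
                if not line.strip():
                    continue  # skip keep-alive blank lines between events
                key, _, value = line.partition(':')
                if key.strip() != 'data':
                    continue  # ignore non-data fields (e.g. "event:" lines or comments)
                value = value.strip()
                if value == '[DONE]':
                    break  # OpenAI-style terminator, if the backend sends one
                delta = json.loads(value)['choices'][0]['delta']
                content = delta.get('content')
                if content:
                    text += content
        return text

    if __name__ == '__main__':
        sample = [
            'data: {"choices":[{"delta":{"content":"stream "}}]}\n',
            'data: {"choices":[{"delta":{"content":"test"}}]}\n',
        ]
        assert extract_stream_content(sample) == 'stream test'

This mirrors the split-on-first-colon handling in the adapter (line.split(':', 1)) and is the behaviour exercised by test_stream_llama_cpp, which mocks the STREAM_ENDPOINT with a single data: line via aioresponses.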