diff --git a/bot/bot.py b/bot/bot.py
index f4510a6dc..0447f7960 100644
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -31,6 +31,7 @@
 import database
 import openai_utils
+import base64
 
 # setup
 db = database.Database()
@@ -177,6 +178,168 @@ async def retry_handle(update: Update, context: CallbackContext):
     await message_handle(update, context, message=last_dialog_message["user"], use_new_dialog_timeout=False)
 
+
+async def _vision_message_handle_fn(
+    update: Update, context: CallbackContext, use_new_dialog_timeout: bool = True
+):
+    logger.info('_vision_message_handle_fn')
+    user_id = update.message.from_user.id
+    current_model = db.get_user_attribute(user_id, "current_model")
+
+    if current_model != "gpt-4-vision-preview":
+        await update.message.reply_text(
+            "🥲 Image processing is only available with the gpt-4-vision-preview model. Please change your settings in /settings",
+            parse_mode=ParseMode.HTML,
+        )
+        return
+
+    chat_mode = db.get_user_attribute(user_id, "current_chat_mode")
+
+    # new dialog timeout
+    if use_new_dialog_timeout:
+        if (datetime.now() - db.get_user_attribute(user_id, "last_interaction")).seconds > config.new_dialog_timeout and len(db.get_dialog_messages(user_id)) > 0:
+            db.start_new_dialog(user_id)
+            await update.message.reply_text(f"Starting new dialog due to timeout ({config.chat_modes[chat_mode]['name']} mode) ✅", parse_mode=ParseMode.HTML)
+    db.set_user_attribute(user_id, "last_interaction", datetime.now())
+
+    buf = None
+    if update.message.effective_attachment:
+        photo = update.message.effective_attachment[-1]
+        photo_file = await context.bot.get_file(photo.file_id)
+
+        # store file in memory, not on disk
+        buf = io.BytesIO()
+        await photo_file.download_to_memory(buf)
+        buf.name = "image.jpg"  # file extension is required
+        buf.seek(0)  # move cursor to the beginning of the buffer
+
+    # in case of CancelledError
+    n_input_tokens, n_output_tokens = 0, 0
+
+    try:
+        # send placeholder message to user
+        placeholder_message = await update.message.reply_text("...")
+        message = update.message.caption or update.message.text
+
+        # send typing action
+        await update.message.chat.send_action(action="typing")
+
+        if message is None or len(message) == 0:
+            await update.message.reply_text(
+                "🥲 You sent an empty message. Please try again!",
+                parse_mode=ParseMode.HTML,
+            )
+            return
+
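+        # assemble the request: prior dialog turns plus the new (text, image) user turn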
+        dialog_messages = db.get_dialog_messages(user_id, dialog_id=None)
+        parse_mode = {"html": ParseMode.HTML, "markdown": ParseMode.MARKDOWN}[
+            config.chat_modes[chat_mode]["parse_mode"]
+        ]
+
+        chatgpt_instance = openai_utils.ChatGPT(model=current_model)
+        if config.enable_message_streaming:
+            gen = chatgpt_instance.send_vision_message_stream(
+                message,
+                dialog_messages=dialog_messages,
+                image_buffer=buf,
+                chat_mode=chat_mode,
+            )
+        else:
+            (
+                answer,
+                (n_input_tokens, n_output_tokens),
+                n_first_dialog_messages_removed,
+            ) = await chatgpt_instance.send_vision_message(
+                message,
+                dialog_messages=dialog_messages,
+                image_buffer=buf,
+                chat_mode=chat_mode,
+            )
+
+            async def fake_gen():
+                yield "finished", answer, (
+                    n_input_tokens,
+                    n_output_tokens,
+                ), n_first_dialog_messages_removed
+
+            gen = fake_gen()
+
+        prev_answer = ""
+        async for gen_item in gen:
+            (
+                status,
+                answer,
+                (n_input_tokens, n_output_tokens),
+                n_first_dialog_messages_removed,
+            ) = gen_item
+            answer = current_model + " " + answer  # prefix the reply with the model name
+            answer = answer[:4096]  # telegram message limit
+
+            # update only when 100 new symbols are ready
+            if abs(len(answer) - len(prev_answer)) < 100 and status != "finished":
+                continue
+
+            try:
+                await context.bot.edit_message_text(
+                    answer,
+                    chat_id=placeholder_message.chat_id,
+                    message_id=placeholder_message.message_id,
+                    parse_mode=parse_mode,
+                )
+            except telegram.error.BadRequest as e:
+                if str(e).startswith("Message is not modified"):
+                    continue
+                else:
+                    # retry without parse_mode: the partial answer may contain unbalanced markup
+                    await context.bot.edit_message_text(
+                        answer,
+                        chat_id=placeholder_message.chat_id,
+                        message_id=placeholder_message.message_id,
+                    )
+
+            await asyncio.sleep(0.01)  # wait a bit to avoid flooding
+
+            prev_answer = answer
+
+        # update user data
+        if buf is not None:
+            base_image = base64.b64encode(buf.getvalue()).decode("utf-8")
+            new_dialog_message = {
+                "user": [
+                    {"type": "text", "text": message},
+                    {
+                        # stored in the same content-part format the API expects,
+                        # so dialog history can be passed back to the model verbatim
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base_image}"},
+                    },
+                ],
+                "bot": answer,
+                "date": datetime.now(),
+            }
+        else:
+            new_dialog_message = {"user": [{"type": "text", "text": message}], "bot": answer, "date": datetime.now()}
+
+        db.set_dialog_messages(
+            user_id,
+            db.get_dialog_messages(user_id, dialog_id=None) + [new_dialog_message],
+            dialog_id=None
+        )
+
+        db.update_n_used_tokens(user_id, current_model, n_input_tokens, n_output_tokens)
+
+    except asyncio.CancelledError:
+        # note: intermediate token updates only work when enable_message_streaming=True (config.yml)
+        db.update_n_used_tokens(user_id, current_model, n_input_tokens, n_output_tokens)
+        raise
+
+    except Exception as e:
+        error_text = f"Something went wrong during completion. Reason: {e}"
+        logger.error(error_text)
+        await update.message.reply_text(error_text)
+        return
+
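+# fallback for attachment types the bot cannot process (wired up in run_bot below)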
+async def unsupport_message_handle(update: Update, context: CallbackContext, message=None):
+    error_text = "I don't know how to read files or videos. Send the picture in normal mode (Quick Mode)."
+    logger.error(error_text)
+    await update.message.reply_text(error_text)
+    return
 
 
 async def message_handle(update: Update, context: CallbackContext, message=None, use_new_dialog_timeout=True):
     # check if bot was mentioned (for group chats)
@@ -204,6 +367,8 @@ async def message_handle(update: Update, context: CallbackContext, message=None,
         await generate_image_handle(update, context, message=message)
         return
 
+    current_model = db.get_user_attribute(user_id, "current_model")
+
     async def message_handle_fn():
         # new dialog timeout
         if use_new_dialog_timeout:
@@ -214,7 +379,6 @@ async def message_handle_fn():
 
         # in case of CancelledError
         n_input_tokens, n_output_tokens = 0, 0
-        current_model = db.get_user_attribute(user_id, "current_model")
 
         try:
             # send placeholder message to user
@@ -249,11 +413,12 @@ async def fake_gen():
             gen = fake_gen()
 
         prev_answer = ""
+
         async for gen_item in gen:
             status, answer, (n_input_tokens, n_output_tokens), n_first_dialog_messages_removed = gen_item
-
+            answer = current_model + " " + answer  # prefix the reply with the model name
             answer = answer[:4096]  # telegram message limit
-
+
             # update only when 100 new symbols are ready
             if abs(len(answer) - len(prev_answer)) < 100 and status != "finished":
                 continue
@@ -267,11 +432,12 @@ async def fake_gen():
                     await context.bot.edit_message_text(answer, chat_id=placeholder_message.chat_id, message_id=placeholder_message.message_id)
 
                 await asyncio.sleep(0.01)  # wait a bit to avoid flooding
-
+
                 prev_answer = answer
-
+
             # update user data
             new_dialog_message = {"user": _message, "bot": answer, "date": datetime.now()}
+
             db.set_dialog_messages(
                 user_id,
                 db.get_dialog_messages(user_id, dialog_id=None) + [new_dialog_message],
@@ -300,7 +466,19 @@ async def fake_gen():
         await update.message.reply_text(text, parse_mode=ParseMode.HTML)
 
     async with user_semaphores[user_id]:
-        task = asyncio.create_task(message_handle_fn())
+        if current_model == "gpt-4-vision-preview" or (update.message.photo is not None and len(update.message.photo) > 0):
+            logger.info('routing message to the gpt-4-vision-preview handler')
+            if current_model != "gpt-4-vision-preview":
+                current_model = "gpt-4-vision-preview"
+                db.set_user_attribute(user_id, "current_model", "gpt-4-vision-preview")
+            task = asyncio.create_task(
+                _vision_message_handle_fn(update, context, use_new_dialog_timeout=use_new_dialog_timeout)
+            )
+        else:
+            task = asyncio.create_task(
+                message_handle_fn()
+            )
+
         user_tasks[user_id] = task
 
         try:
@@ -392,6 +570,7 @@ async def new_dialog_handle(update: Update, context: CallbackContext):
     user_id = update.message.from_user.id
     db.set_user_attribute(user_id, "last_interaction", datetime.now())
+    db.set_user_attribute(user_id, "current_model", "gpt-3.5-turbo")  # reset to the default text model on /new
     db.start_new_dialog(user_id)
 
     await update.message.reply_text("Starting new dialog ✅")
@@ -672,6 +851,9 @@ def run_bot() -> None:
     application.add_handler(CommandHandler("help_group_chat", help_group_chat_handle, filters=user_filter))
 
     application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & user_filter, message_handle))
+    application.add_handler(MessageHandler(filters.PHOTO & ~filters.COMMAND & user_filter, message_handle))
+    application.add_handler(MessageHandler(filters.VIDEO & ~filters.COMMAND & user_filter, unsupport_message_handle))
+    application.add_handler(MessageHandler(filters.Document.ALL & ~filters.COMMAND & user_filter, unsupport_message_handle))
     application.add_handler(CommandHandler("retry", retry_handle, filters=user_filter))
     application.add_handler(CommandHandler("new", new_dialog_handle, filters=user_filter))
application.add_handler(CommandHandler("cancel", cancel_handle, filters=user_filter)) @@ -694,4 +876,4 @@ def run_bot() -> None: if __name__ == "__main__": - run_bot() + run_bot() \ No newline at end of file diff --git a/bot/openai_utils.py b/bot/openai_utils.py index cd3168ebd..cf19257e7 100644 --- a/bot/openai_utils.py +++ b/bot/openai_utils.py @@ -1,4 +1,7 @@ +import base64 +from io import BytesIO import config +import logging import tiktoken import openai @@ -8,6 +11,7 @@ openai.api_key = config.openai_api_key if config.openai_api_base is not None: openai.api_base = config.openai_api_base +logger = logging.getLogger(__name__) OPENAI_COMPLETION_OPTIONS = { @@ -22,7 +26,7 @@ class ChatGPT: def __init__(self, model="gpt-3.5-turbo"): - assert model in {"text-davinci-003", "gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"}, f"Unknown model: {model}" + assert model in {"text-davinci-003", "gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview", "gpt-4-vision-preview"}, f"Unknown model: {model}" self.model = model async def send_message(self, message, dialog_messages=[], chat_mode="assistant"): @@ -33,8 +37,9 @@ async def send_message(self, message, dialog_messages=[], chat_mode="assistant") answer = None while answer is None: try: - if self.model in {"gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"}: + if self.model in {"gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview", "gpt-4-vision-preview"}: messages = self._generate_prompt_messages(message, dialog_messages, chat_mode) + r = await openai.ChatCompletion.acreate( model=self.model, messages=messages, @@ -75,6 +80,7 @@ async def send_message_stream(self, message, dialog_messages=[], chat_mode="assi try: if self.model in {"gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"}: messages = self._generate_prompt_messages(message, dialog_messages, chat_mode) + r_gen = await openai.ChatCompletion.acreate( model=self.model, messages=messages, @@ -85,11 +91,15 @@ async def send_message_stream(self, message, dialog_messages=[], chat_mode="assi answer = "" async for r_item in r_gen: delta = r_item.choices[0].delta + if "content" in delta: answer += delta.content n_input_tokens, n_output_tokens = self._count_tokens_from_messages(messages, answer, model=self.model) - n_first_dialog_messages_removed = n_dialog_messages_before - len(dialog_messages) + n_first_dialog_messages_removed = 0 + yield "not_finished", answer, (n_input_tokens, n_output_tokens), n_first_dialog_messages_removed + + elif self.model == "text-davinci-003": prompt = self._generate_prompt(message, dialog_messages, chat_mode) r_gen = await openai.Completion.acreate( @@ -117,6 +127,109 @@ async def send_message_stream(self, message, dialog_messages=[], chat_mode="assi yield "finished", answer, (n_input_tokens, n_output_tokens), n_first_dialog_messages_removed # sending final answer + async def send_vision_message( + self, + message, + dialog_messages=[], + chat_mode="assistant", + image_buffer: BytesIO = None, + ): + n_dialog_messages_before = len(dialog_messages) + answer = None + while answer is None: + try: + if self.model == "gpt-4-vision-preview": + messages = self._generate_prompt_messages( + message, dialog_messages, chat_mode, image_buffer + ) + r = await openai.ChatCompletion.acreate( + model=self.model, + messages=messages, + **OPENAI_COMPLETION_OPTIONS + ) + answer = r.choices[0].message.content + else: + raise ValueError(f"Unsupported model: {self.model}") + + answer = 
+    async def send_vision_message_stream(
+        self,
+        message,
+        dialog_messages=[],
+        chat_mode="assistant",
+        image_buffer: BytesIO = None,
+    ):
+        n_dialog_messages_before = len(dialog_messages)
+        answer = None
+        n_input_tokens, n_output_tokens = 0, 0  # defaults in case the stream yields no content
+        while answer is None:
+            try:
+                if self.model == "gpt-4-vision-preview":
+                    messages = self._generate_prompt_messages(
+                        message, dialog_messages, chat_mode, image_buffer
+                    )
+
+                    r_gen = await openai.ChatCompletion.acreate(
+                        model=self.model,
+                        messages=messages,
+                        stream=True,
+                        **OPENAI_COMPLETION_OPTIONS,
+                    )
+
+                    answer = ""
+                    async for r_item in r_gen:
+                        delta = r_item.choices[0].delta
+                        if "content" in delta:
+                            answer += delta.content
+                            (
+                                n_input_tokens,
+                                n_output_tokens,
+                            ) = self._count_tokens_from_messages(
+                                messages, answer, model=self.model
+                            )
+                            n_first_dialog_messages_removed = (
+                                n_dialog_messages_before - len(dialog_messages)
+                            )
+                            yield "not_finished", answer, (
+                                n_input_tokens,
+                                n_output_tokens,
+                            ), n_first_dialog_messages_removed
+
+                    answer = self._postprocess_answer(answer)
+
+            except openai.error.InvalidRequestError as e:  # too many tokens
+                if len(dialog_messages) == 0:
+                    raise e
+                # forget first message in dialog_messages
+                dialog_messages = dialog_messages[1:]
+
+        n_first_dialog_messages_removed = n_dialog_messages_before - len(dialog_messages)
+        yield "finished", answer, (
+            n_input_tokens,
+            n_output_tokens,
+        ), n_first_dialog_messages_removed
+
     def _generate_prompt(self, message, dialog_messages, chat_mode):
         prompt = config.chat_modes[chat_mode]["prompt_start"]
         prompt += "\n\n"
@@ -134,16 +247,32 @@ def _generate_prompt(self, message, dialog_messages, chat_mode):
 
         return prompt
 
-    def _generate_prompt_messages(self, message, dialog_messages, chat_mode):
+    def _encode_image(self, image_buffer: BytesIO) -> str:
+        # base64-encode the image for embedding in a data URL
+        return base64.b64encode(image_buffer.read()).decode("utf-8")
+
+    def _generate_prompt_messages(self, message, dialog_messages, chat_mode, image_buffer: BytesIO = None):
         prompt = config.chat_modes[chat_mode]["prompt_start"]
 
         messages = [{"role": "system", "content": prompt}]
+        # all user turns are merged into one multimodal message appended after the history
+        user_messages = {"role": "user", "content": []}
+
         for dialog_message in dialog_messages:
-            messages.append({"role": "user", "content": dialog_message["user"]})
+            # older dialogs store "user" as a plain string, newer ones as a list of content parts
+            if isinstance(dialog_message["user"], str):
+                user_messages["content"].append({"type": "text", "text": dialog_message["user"]})
+            else:
+                user_messages["content"].extend(dialog_message["user"])
             messages.append({"role": "assistant", "content": dialog_message["bot"]})
-        messages.append({"role": "user", "content": message})
+
+        user_messages["content"].append({"type": "text", "text": message})
+
+        if image_buffer is not None:
+            user_messages["content"].append(
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{self._encode_image(image_buffer)}",
+                    },
+                }
+            )
 
-        return messages
+        response = messages + ([user_messages] if len(user_messages["content"]) > 0 else [])
+        return response
 
     def _postprocess_answer(self, answer):
         answer = answer.strip()
@@ -164,6 +293,9 @@ def _count_tokens_from_messages(self, messages, answer, model="gpt-3.5-turbo"):
         elif model == "gpt-4-1106-preview":
             tokens_per_message = 3
             tokens_per_name = 1
+        elif model == "gpt-4-vision-preview":
+            tokens_per_message = 3
+            tokens_per_name = 1
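+            # assumption: gpt-4-vision-preview shares the per-message token overhead of the other gpt-4 chat models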
"gpt-4-vision-preview": + tokens_per_message = 3 + tokens_per_name = 1 else: raise ValueError(f"Unknown model: {model}") @@ -171,10 +303,20 @@ def _count_tokens_from_messages(self, messages, answer, model="gpt-3.5-turbo"): n_input_tokens = 0 for message in messages: n_input_tokens += tokens_per_message - for key, value in message.items(): - n_input_tokens += len(encoding.encode(value)) - if key == "name": - n_input_tokens += tokens_per_name + if isinstance(message["content"], list): + for sub_message in message["content"]: + if "type" in sub_message: + if sub_message["type"] == "text": + n_input_tokens += len(encoding.encode(sub_message["text"])) + elif sub_message["type"] == "image_url": + pass + else: + if "type" in message: + if message["type"] == "text": + n_input_tokens += len(encoding.encode(message["text"])) + elif message["type"] == "image_url": + pass + n_input_tokens += 2 @@ -205,4 +347,4 @@ async def generate_images(prompt, n_images=4, size="512x512"): async def is_content_acceptable(prompt): r = await openai.Moderation.acreate(input=prompt) - return not all(r.results[0].categories.values()) + return not all(r.results[0].categories.values()) \ No newline at end of file diff --git a/config/models.yml b/config/models.yml index dc35f2d63..ebd1bc90f 100644 --- a/config/models.yml +++ b/config/models.yml @@ -1,4 +1,4 @@ -available_text_models: ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-1106-preview", "gpt-4", "text-davinci-003"] +available_text_models: ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-1106-preview", "gpt-4-vision-preview", "gpt-4", "text-davinci-003"] info: gpt-3.5-turbo: @@ -53,6 +53,19 @@ info: fast: 4 cheap: 3 + gpt-4-vision-preview: + type: chat_completion + name: GPT-4 Vision + description: Ability to understand images, in addition to all other GPT-4 Turbo capabilties. + + price_per_1000_input_tokens: 0.01 + price_per_1000_output_tokens: 0.03 + + scores: + smart: 5 + fast: 4 + cheap: 3 + text-davinci-003: type: completion name: GPT-3.5