From 854c95270f42129d43291fb874710209cd5822e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Juvenal?= Date: Wed, 2 Oct 2024 16:38:27 -0300 Subject: [PATCH] Improve performance and precision on movies AI --- example/README.md | 1 + example/example/settings.py | 1 + example/movies/ai_assistants.py | 85 +++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/example/README.md b/example/README.md index 408a886..3ba02ee 100644 --- a/example/README.md +++ b/example/README.md @@ -34,6 +34,7 @@ Fill the `.env` file with the necessary API keys. You'll need accounts on: - [OpenAI](https://platform.openai.com/) - [Weather API](https://www.weatherapi.com/) - [Brave Search API](https://app.tavily.com/) +- [Jina Reader API](https://jina.ai/) Activate the poetry shell: diff --git a/example/example/settings.py b/example/example/settings.py index 7a5fac0..690b8c5 100644 --- a/example/example/settings.py +++ b/example/example/settings.py @@ -180,6 +180,7 @@ # Example specific settings: WEATHER_API_KEY = os.getenv("WEATHER_API_KEY") # get for free at https://www.weatherapi.com/ +JINA_API_KEY = os.getenv("JINA_API_KEY") # get for free at https://jina.ai/ BRAVE_SEARCH_API_KEY = os.getenv( "BRAVE_SEARCH_API_KEY" ) # get for free at https://brave.com/search/api/ diff --git a/example/movies/ai_assistants.py b/example/movies/ai_assistants.py index dc7b132..9841c63 100644 --- a/example/movies/ai_assistants.py +++ b/example/movies/ai_assistants.py @@ -1,3 +1,5 @@ +import threading +import time from typing import Sequence from django.conf import settings @@ -8,16 +10,30 @@ import requests from langchain_community.tools import BraveSearch from langchain_core.tools import BaseTool -from pydantic import BaseModel from django_ai_assistant import AIAssistant, method_tool from movies.models import MovieBacklogItem -class IMDbMovie(BaseModel): - imdb_url: str - imdb_rating: float - scrapped_imdb_page_markdown: str +brave_search_lock = 
threading.Lock()
+
+
+class RateLimitedBraveSearch(BraveSearch):
+    def _run(self, query: str, **kwargs) -> str:
+        """Search Brave while holding brave_search_lock; TimeoutError after a 10s wait."""
+        # brave_search_lock is necessary to ensure 1 request/second,
+        # due to free plan limitations of Brave Search API:
+        if not brave_search_lock.acquire(timeout=10):
+            raise TimeoutError("Timed out waiting for the Brave Search lock.")
+        try:
+            start_time = time.monotonic()
+            result = self.search_wrapper.run(query)
+            elapsed_time = time.monotonic() - start_time
+            if 1 - elapsed_time > 0:
+                time.sleep(1 - elapsed_time + 0.2)  # sleep plus some jitter
+            return result
+        finally:
+            brave_search_lock.release()


 # Note this assistant is not registered, but we'll use it as a tool on the other.
@@ -28,31 +44,38 @@ class IMDbScraper(AIAssistant):
         "You're a function to find the IMDb URL of a given movie, "
         "and scrape this URL to get the movie rating and other information.\n"
         "Use the search function to find the IMDb URL. "
-        "Make search queries like: \n"
-        "- IMDb page of The Matrix\n"
-        "- IMDb page of The Godfather\n"
-        "- IMDb page of The Shawshank Redemption\n"
-        "Then check results, scape the IMDb URL, process the page, and produce a JSON output."
+        "Make search queries like:\n"
+        "- IMDb page of \n"
+        "Then check results, scrape the IMDb URL, process the page, and produce an output like this: \n"
+        "- IMDb URL: ...\n"
+        "- IMDb Rating: ...\n"
+        "- IMDb Page: "
     )
     name = "IMDb Scraper"
     model = "gpt-4o-mini"
-    structured_output = IMDbMovie
+    tool_max_concurrency = 4

     def get_instructions(self):
         # Warning: this will use the server's timezone
         # See: https://docs.djangoproject.com/en/5.0/topics/i18n/timezones/#default-time-zone-and-current-time-zone
         # In a real application, you should use the user's timezone
         current_date_str = timezone.now().date().isoformat()
-        return f"{self.instructions} Today is: {current_date_str}."
+        return f"{self.instructions}.\n Today is: {current_date_str}."

     @method_tool
     def scrape_imdb_url(self, url: str) -> str:
-        """Scrape the IMDb URL and return the content as markdown."""
-        return requests.get("https://r.jina.ai/" + url, timeout=20).text[:10000]
+        """Scrape the IMDb URL and return the content as Markdown (first 30000 chars)."""
+        api_key = settings.JINA_API_KEY  # None when the env var is unset
+        # Send the Authorization header only when a key is configured:
+        # concatenating "Bearer " with None would raise TypeError.
+        headers = {"Authorization": "Bearer " + api_key} if api_key else {}
+        return requests.get(
+            "https://r.jina.ai/" + url, headers=headers, timeout=20
+        ).text[:30000]

     def get_tools(self) -> Sequence[BaseTool]:
         return [
-            BraveSearch.from_api_key(
+            RateLimitedBraveSearch.from_api_key(
                 api_key=settings.BRAVE_SEARCH_API_KEY, search_kwargs={"count": 5}
             ),
             *super().get_tools(),
@@ -63,18 +86,23 @@ class MovieRecommendationAIAssistant(AIAssistant):
     id = "movie_recommendation_assistant"  # noqa: A003
     instructions = (
         "You're a helpful movie recommendation assistant. "
-        "Help the user find movies to watch and manage their movie backlogs. "
-        "Use the provided functions to answer questions and run operations.\n"
+        "Use the provided functions to answer queries and run operations.\n"
+        "Use the search function to find movie recommendations based on user's query.\n"
+        "Then, use the IMDb Scraper to get the IMDb URL and rating of the movies you're recommending. "
+        "Both the IMDb URL and rating are necessary to add a movie to the user's backlog. "
         "Note the backlog is stored in a DB. "
-        "When managing the backlog, you must call the functions, to keep the sync with the DB. "
+        "When managing the backlog, you must call the functions, to keep your answers in sync with the DB. "
         "The backlog has an order, and you should respect it. Call `reorder_backlog` when necessary.\n"
-        "Include the IMDb URL and rating of the movies when displaying the backlog. "
-        "You must use the IMDb Scraper to get the IMDb URL and rating of the movies. \n"
-        "Ask the user if they want to add your recommended movies to their backlog, "
-        "but only if the movie is not on the user's backlog yet."
+ "When showing the backlog, show the movies in the order they are stored in the DB, " + "and include the IMDb URL and rating.\n" + "Ask the user if they want to add your recommended movies to their backlog.\n" + "User may talk to you in any language. Respond with the same language, " + "but refer to movies and call functions with their English name.\n" + "Do not include images in your response." ) name = "Movie Recommendation Assistant" model = "gpt-4o-mini" + tool_max_concurrency = 4 def get_instructions(self): # Warning: this will use the server's timezone @@ -93,10 +121,14 @@ def get_instructions(self): def get_tools(self) -> Sequence[BaseTool]: return [ - BraveSearch.from_api_key( + RateLimitedBraveSearch.from_api_key( api_key=settings.BRAVE_SEARCH_API_KEY, search_kwargs={"count": 5} ), - IMDbScraper().as_tool(description="IMDb Scraper to get the IMDb data a given movie."), + IMDbScraper().as_tool( + description="IMDb Scraper to get the IMDb data a given movie. " + "Given a movie name (in English), " + "finds the movie URL, rating, and scrapes the IMDb page (as Markdown)." + ), *super().get_tools(), ] @@ -116,7 +148,10 @@ def get_movies_backlog(self) -> str: @method_tool def add_movie_to_backlog(self, movie_name: str, imdb_url: str, imdb_rating: float) -> str: - """Add a movie to user's backlog. Must pass the movie_name, imdb_url, and imdb_rating.""" + """ + Add a movie to user's backlog. Must pass the movie_name, imdb_url, and imdb_rating. + Set imdb_rating to 0.0 if not available. + """ with transaction.atomic(): MovieBacklogItem.objects.update_or_create(