From c3ea7043199e726305ef33111e4a3283b33b4047 Mon Sep 17 00:00:00 2001
From: burtenshaw
Date: Mon, 13 Jan 2025 19:12:17 +0100
Subject: [PATCH] mistakes in retrievers

---
 8_agents/retrieval_agents.md | 97 ++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 42 deletions(-)

diff --git a/8_agents/retrieval_agents.md b/8_agents/retrieval_agents.md
index 331ac261..fcd70d28 100644
--- a/8_agents/retrieval_agents.md
+++ b/8_agents/retrieval_agents.md
@@ -11,7 +11,6 @@ Let's start by building a simple agent that can search the web using DuckDuckGo.
 ```python
 from smolagents import Agent
 from smolagents.tools import DuckDuckGoSearch
-from smolagents.memory import SqliteMemory

 # Initialize the search tool
 search_tool = DuckDuckGoSearch()
@@ -20,8 +19,7 @@ search_tool = DuckDuckGoSearch()
 agent = Agent(
     name="research_assistant",
     description="I help find and synthesize information from the web",
-    tools=[search_tool],
-    memory=SqliteMemory(db_path="agent_memory.db")
+    tools=[search_tool]
 )

 # Example usage
@@ -42,48 +40,39 @@ The agent will:
 For domain-specific applications, we often want to combine web search with our own knowledge base. Let's create a custom tool that can query a vector database of technical documentation.

 ```python
-from smolagents import Agent, Tool
-from smolagents.embeddings import OpenAIEmbeddings
-from smolagents.vectorstores import Qdrant
-
-class DocumentationTool(Tool):
-    def __init__(self, docs_path: str):
-        self.embeddings = OpenAIEmbeddings()
-        self.vectorstore = Qdrant(
-            collection_name="documentation",
-            embeddings=self.embeddings
+from smolagents import Tool
+
+class RetrieverTool(Tool):
+    name = "retriever"
+    description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query."
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, docs, **kwargs):
+        super().__init__(**kwargs)
+        self.retriever = BM25Retriever.from_documents(
+            docs, k=10
         )
-        # Load and index documentation
-        self.load_docs(docs_path)
-
-    def load_docs(self, path: str):
-        # Load documents and split into chunks
-        documents = self.load_and_split(path)
-        # Index in vector store
-        self.vectorstore.add_documents(documents)
-
-    def search(self, query: str, k: int = 3):
-        # Retrieve relevant documents
-        results = self.vectorstore.similarity_search(query, k=k)
-        return [doc.page_content for doc in results]
-
-# Create tools
-docs_tool = DocumentationTool("path/to/docs")
-search_tool = DuckDuckGoSearch()

-# Initialize agent with both tools
-agent = Agent(
-    name="technical_assistant",
-    description="I help answer questions using documentation and web search",
-    tools=[docs_tool, search_tool],
-    memory=SqliteMemory(db_path="tech_agent_memory.db")
-)
+    def forward(self, query: str) -> str:
+        assert isinstance(query, str), "Your search query must be a string"

-# Example usage
-response = agent.run(
-    "How do I implement RAG in my application? Include both general concepts and specific code examples."
-)
-print(response)
+        docs = self.retriever.invoke(
+            query,
+        )
+        return "\nRetrieved documents:\n" + "".join(
+            [
+                f"\n\n===== Document {str(i)} =====\n" + doc.page_content
+                for i, doc in enumerate(docs)
+            ]
+        )
+
+retriever_tool = RetrieverTool(docs_processed)
 ```

 This enhanced agent can:
@@ -103,6 +92,30 @@ When building agentic RAG systems, the agent can employ sophisticated strategies

 Effective agentic RAG systems require careful consideration of several key aspects. The agent should select between available tools based on the query type and context. Memory systems help maintain conversation history and avoid repetitive retrievals. Having fallback strategies ensures the system can still provide value even when primary retrieval methods fail. Additionally, implementing validation steps helps ensure the accuracy and relevance of retrieved information.

+```python
+import datasets
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.retrievers import BM25Retriever
+
+knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
+knowledge_base = knowledge_base.filter(lambda row: row["source"].startswith("huggingface/transformers"))
+
+source_docs = [
+    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]})
+    for doc in knowledge_base
+]
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=50,
+    add_start_index=True,
+    strip_whitespace=True,
+    separators=["\n\n", "\n", ".", " ", ""],
+)
+docs_processed = text_splitter.split_documents(source_docs)
+```
+
 ## Next Steps

 ⏩ Check out the [Code Agents](./code_agents.md) module to learn how to build agents that can manipulate code.
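
Note: after this patch, the example ends at `retriever_tool = RetrieverTool(docs_processed)` and no longer shows the tool attached to an agent (the old `Agent(...)` wiring is removed). Below is a minimal sketch of how it might be hooked up, assuming the `CodeAgent`, `HfApiModel`, and `DuckDuckGoSearchTool` names that smolagents exposed around this release (exact names may differ by version) and reusing `RetrieverTool` and `docs_processed` from the patched file:

```python
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel

# RetrieverTool and docs_processed are defined in the patched document above.
retriever_tool = RetrieverTool(docs_processed)

# Give the agent both the documentation retriever and a web search tool,
# so it can choose between them based on the query, as the tutorial describes.
agent = CodeAgent(
    tools=[retriever_tool, DuckDuckGoSearchTool()],
    model=HfApiModel(),
    max_steps=4,
)

response = agent.run(
    "How do I implement RAG in my application? Include both general concepts and specific code examples."
)
print(response)
```

Registering both tools is what gives the agent the tool-selection behaviour the tutorial claims: it can consult project documentation first and reach for the open web only when the question falls outside it.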
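The last hunk's paragraph on fallback strategies and validation is not illustrated by the added code, which only builds the BM25 index. A rough sketch of that idea under the same assumptions, treating `retriever_tool` as callable (smolagents `Tool` instances implement `__call__`) and using a hypothetical `web_search` callable as a stand-in for any web search tool:

```python
def retrieve_with_fallback(query: str, web_search, min_chars: int = 200) -> str:
    """Try the documentation retriever first; fall back to web search."""
    docs_answer = retriever_tool(query)
    # Crude validation step: only trust the documentation results if they
    # returned a substantive amount of text for this query.
    if len(docs_answer.strip()) >= min_chars:
        return docs_answer
    # Primary retrieval came back thin, so fall back to the web.
    return web_search(query)

# Example: reuse the search tool as the fallback.
# answer = retrieve_with_fallback("How to fine-tune a transformers model", web_search=DuckDuckGoSearchTool())
```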