Commit e96359c: Merge branch 'master' into master
yaksh0nti authored Jun 20, 2024
2 parents 9e68444 + a349fce commit e96359c
Showing 12 changed files with 216 additions and 38 deletions.
11 changes: 11 additions & 0 deletions docs/docs/how_to/document_loader_pdf.ipynb
@@ -69,6 +69,17 @@
"Once we have loaded PDFs into LangChain `Document` objects, we can index them (e.g., a RAG application) in the usual way:"
]
},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c3b932bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "%pip install faiss-cpu\n",
+  "# use `pip install faiss-gpu` for CUDA GPU support"
+ ]
+},
{
"cell_type": "code",
"execution_count": null,
6 changes: 5 additions & 1 deletion docs/docs/how_to/functions.ipynb
@@ -300,7 +300,11 @@
"id": "922b48bd",
"metadata": {},
"source": [
"# Streaming\n",
"## Streaming\n",
"\n",
":::{.callout-note}\n",
"[RunnableLambda](https://api.python.langchain.com/en/latest/runnables/langchain_core.runnables.base.RunnableLambda.html) is best suited for code that does not need to support streaming. If you need to support streaming (i.e., be able to operate on chunks of inputs and yield chunks of outputs), use [RunnableGenerator](https://api.python.langchain.com/en/latest/runnables/langchain_core.runnables.base.RunnableGenerator.html) instead as in the example below.\n",
":::\n",
"\n",
"You can use generator functions (ie. functions that use the `yield` keyword, and behave like iterators) in a chain.\n",
"\n",
13 changes: 13 additions & 0 deletions docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb
@@ -12,6 +12,19 @@
"This covers how to load `Microsoft PowerPoint` documents into a document format that we can use downstream."
]
},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aef1500f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "# Install packages\n",
+  "%pip install unstructured\n",
+  "%pip install python-magic\n",
+  "%pip install python-pptx"
+ ]
+},
{
"cell_type": "code",
"execution_count": 1,
10 changes: 2 additions & 8 deletions docs/docs/tutorials/qa_chat_history.ipynb
@@ -322,7 +322,7 @@
"\n",
"Now we can build our full QA chain. This is as simple as updating the retriever to be our new `history_aware_retriever`.\n",
"\n",
"Again, we will use [create_stuff_documents_chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html) to generate a `question_answer_chain`, with input keys `context`, `chat_history`, and `input`-- it accepts the retrieved context alongside the conversation history and query to generate an answer.\n",
"Again, we will use [create_stuff_documents_chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html) to generate a `question_answer_chain`, with input keys `context`, `chat_history`, and `input`-- it accepts the retrieved context alongside the conversation history and query to generate an answer. A more detailed explaination is over [here](/docs/tutorials/rag/#built-in-chains)\n",
"\n",
"We build our final `rag_chain` with [create_retrieval_chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html). This chain applies the `history_aware_retriever` and `question_answer_chain` in sequence, retaining intermediate outputs such as the retrieved context for convenience. It has input keys `input` and `chat_history`, and includes `input`, `chat_history`, `context`, and `answer` in its output."
]
@@ -760,13 +760,6 @@
"id": "931c4fe3-c603-4efb-9b37-5f7cbbb1cbbd",
"metadata": {},
"outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  "Error in LangChainTracer.on_tool_end callback: TracerException(\"Found chain run at ID 0ec120e2-b1fc-4593-9fee-2dd4f4cae256, but expected {'tool'} run.\")\n"
- ]
-},
{
"data": {
"text/plain": [
@@ -1030,6 +1023,7 @@
"from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langgraph.checkpoint.sqlite import SqliteSaver\n",
"from langgraph.prebuilt import create_react_agent\n",
"\n",
"memory = SqliteSaver.from_conn_string(\":memory:\")\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n",
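For readers following along, the pieces described above assemble roughly like this. A sketch, assuming `llm` and `history_aware_retriever` were constructed earlier in the tutorial:

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Answer the question using the following context:\n\n{context}"),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

result = rag_chain.invoke({"input": "What is Task Decomposition?", "chat_history": []})
# result contains "input", "chat_history", "context", and "answer" keys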
3 changes: 2 additions & 1 deletion docs/scripts/generate_api_reference_links.py
@@ -24,7 +24,7 @@

 _CURRENT_PATH = Path(__file__).parent.absolute()
 # Directory where generated markdown files are stored
-_DOCS_DIR = _CURRENT_PATH / "docs"
+_DOCS_DIR = _CURRENT_PATH.parent.parent / "docs"


def find_files(path):
@@ -75,6 +75,7 @@ def main():

     for file in find_files(args.docs_dir):
         file_imports = replace_imports(file)
+        print(file)

if file_imports:
# Use relative file path as key
16 changes: 14 additions & 2 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -89,7 +89,13 @@ def __init__(

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
-        import pypdf
+        try:
+            import pypdf
+        except ImportError:
+            raise ImportError(
+                "`pypdf` package not found, please install it with "
+                "`pip install pypdf`"
+            )
 
         with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
             pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
@@ -144,7 +150,13 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
"""Lazily parse the blob."""

if not self.extract_images:
from pdfminer.high_level import extract_text
try:
from pdfminer.high_level import extract_text
except ImportError:
raise ImportError(
"`pdfminer` package not found, please install it with "
"`pip install pdfminer.six`"
)

with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
if self.concatenate_pages:
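With the guarded imports above, a missing dependency now fails with an installation hint instead of a bare ImportError. A usage sketch (the file name is a placeholder):

from langchain_community.document_loaders.parsers.pdf import PyPDFParser
from langchain_core.documents.base import Blob

parser = PyPDFParser()
blob = Blob.from_path("example.pdf")  # hypothetical local file
docs = list(parser.lazy_parse(blob))  # one Document per page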
4 changes: 2 additions & 2 deletions libs/community/langchain_community/utilities/sql_database.py
@@ -201,10 +201,10 @@ def from_databricks(
             from dbruntime.databricks_repl_context import get_context
 
             context = get_context()
-            default_host = context.browserHostName
+            default_host = context.browserHostName if context else None
         except ImportError:
-            pass
+            default_host = None
 
         if host is None:
             host = get_from_env("host", "DATABRICKS_HOST", default_host)
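With the fix, `default_host` is resolved safely whether or not a Databricks REPL context is present; outside Databricks the host and token come from arguments or environment variables. A hedged usage sketch (all values are placeholders, and the Databricks SQL connector must be installed):

from langchain_community.utilities import SQLDatabase

db = SQLDatabase.from_databricks(
    catalog="samples",
    schema="nyctaxi",
    host="adb-1234567890123456.7.azuredatabricks.net",  # placeholder; or set DATABRICKS_HOST
    api_token="dapi-xxxx",  # placeholder; or set DATABRICKS_TOKEN
    warehouse_id="abcdef1234567890",  # placeholder
)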

36 changes: 22 additions & 14 deletions libs/core/langchain_core/runnables/base.py
@@ -1357,6 +1357,7 @@ def with_alisteners(
         Example:
             .. code-block:: python
+
                 from langchain_core.runnables import RunnableLambda
                 import time
@@ -3388,6 +3389,7 @@ async def agen(input: AsyncIterator[Any]) -> AsyncIterator[str]:
     RunnableGenerator makes it easy to implement custom behavior within a streaming
     context. Below we show an example:
 
         .. code-block:: python
+
             from langchain_core.prompts import ChatPromptTemplate
@@ -4384,12 +4386,15 @@ def with_listeners(
             Union[Callable[[Run], None], Callable[[Run, RunnableConfig], None]]
         ] = None,
     ) -> RunnableEach[Input, Output]:
-        """
-        Bind lifecycle listeners to a Runnable, returning a new Runnable.
+        """Bind lifecycle listeners to a Runnable, returning a new Runnable.
 
-        on_start: Called before the runnable starts running, with the Run object.
-        on_end: Called after the runnable finishes running, with the Run object.
-        on_error: Called if the runnable throws an error, with the Run object.
+        Args:
+            on_start: Called before the runnable starts running, with the Run object.
+            on_end: Called after the runnable finishes running, with the Run object.
+            on_error: Called if the runnable throws an error, with the Run object.
+
+        Returns:
+            A new Runnable with the listeners bound.
 
         The Run object contains information about the run, including its id,
         type, input, output, error, start_time, end_time, and any tags or metadata
@@ -4408,15 +4413,18 @@ def with_alisteners(
         on_end: Optional[AsyncListener] = None,
         on_error: Optional[AsyncListener] = None,
     ) -> RunnableEach[Input, Output]:
-        """
-        Bind async lifecycle listeners to a Runnable, returning a new Runnable.
-        on_start: Called asynchronously before the runnable starts running,
-            with the Run object.
-        on_end: Called asynchronously after the runnable finishes running,
-            with the Run object.
-        on_error: Called asynchronously if the runnable throws an error,
-            with the Run object.
+        """Bind async lifecycle listeners to a Runnable, returning a new Runnable.
+
+        Args:
+            on_start: Called asynchronously before the runnable starts running,
+                with the Run object.
+            on_end: Called asynchronously after the runnable finishes running,
+                with the Run object.
+            on_error: Called asynchronously if the runnable throws an error,
+                with the Run object.
+
+        Returns:
+            A new Runnable with the listeners bound.
 
         The Run object contains information about the run, including its id,
         type, input, output, error, start_time, end_time, and any tags or metadata
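The restructured docstrings describe the listener hooks; in practice they attach like this. A minimal sketch on a plain RunnableLambda rather than RunnableEach:

from langchain_core.runnables import RunnableLambda
from langchain_core.tracers.schemas import Run


def on_start(run: Run) -> None:
    print("started:", run.id)


def on_end(run: Run) -> None:
    print("finished:", run.id)


chain = RunnableLambda(lambda x: x + 1).with_listeners(on_start=on_start, on_end=on_end)
chain.invoke(1)  # prints the start and end Run ids around the call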
9 changes: 9 additions & 0 deletions libs/partners/pinecone/langchain_pinecone/vectorstores.py
@@ -118,6 +118,7 @@ def add_texts(
         embedding_chunk_size: int = 1000,
         *,
         async_req: bool = True,
+        id_prefix: Optional[str] = None,
         **kwargs: Any,
     ) -> List[str]:
         """Run more texts through the embeddings and add to the vectorstore.
@@ -133,6 +134,7 @@ def add_texts(
             namespace: Optional pinecone namespace to add the texts to.
             batch_size: Batch size to use when adding the texts to the vectorstore.
             embedding_chunk_size: Chunk size to use when embedding the texts.
+            id_prefix: Optional string to use as an ID prefix when upserting vectors.
 
         Returns:
             List of ids from adding the texts into the vectorstore.
@@ -143,6 +145,10 @@ def add_texts(

         texts = list(texts)
         ids = ids or [str(uuid.uuid4()) for _ in texts]
+        if id_prefix:
+            ids = [
+                id_prefix + "#" + id if id_prefix + "#" not in id else id for id in ids
+            ]
         metadatas = metadatas or [{} for _ in texts]
         for metadata, text in zip(metadatas, texts):
             metadata[self._text_key] = text
@@ -406,6 +412,8 @@ def from_texts(
         upsert_kwargs: Optional[dict] = None,
         pool_threads: int = 4,
         embeddings_chunk_size: int = 1000,
+        *,
+        id_prefix: Optional[str] = None,
         **kwargs: Any,
     ) -> PineconeVectorStore:
         """Construct Pinecone wrapper from raw documents.
@@ -445,6 +453,7 @@ def from_texts(
             namespace=namespace,
             batch_size=batch_size,
             embedding_chunk_size=embeddings_chunk_size,
+            id_prefix=id_prefix,
             **(upsert_kwargs or {}),
         )
         return pinecone
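Given the `id_prefix + "#" + id` join in add_texts above, the new parameter behaves as follows. A short sketch, assuming `vectorstore` is an already-constructed PineconeVectorStore:

ids = vectorstore.add_texts(
    ["doc one", "doc two"],
    id_prefix="user-123",  # stored ids become "user-123#<uuid>"
    async_req=False,
)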
129 changes: 121 additions & 8 deletions libs/partners/pinecone/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion libs/partners/pinecone/pyproject.toml
@@ -14,7 +14,7 @@ license = "MIT"
# <3.13 is due to restriction in pinecone-client package
python = ">=3.8.1,<3.13"
langchain-core = ">=0.1.52,<0.3"
pinecone-client = "^3.2.2"
pinecone-client = ">=3.2.2,<5"
# Support Python 3.8 and 3.12+.
numpy = [
{version = "^1", python = "<3.12"},
15 changes: 14 additions & 1 deletion libs/partners/pinecone/tests/unit_tests/test_vectorstores.py
@@ -1,6 +1,6 @@
from unittest.mock import Mock

-from langchain_pinecone.vectorstores import Pinecone
+from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


def test_initialization() -> None:
@@ -10,3 +10,16 @@ def test_initialization() -> None:
     embedding = Mock()
     text_key = "xyz"
     Pinecone(index, embedding, text_key)
+
+
+def test_id_prefix() -> None:
+    """Test integration of the id_prefix parameter."""
+    embedding = Mock()
+    embedding.embed_documents = Mock(return_value=[0.1, 0.2, 0.3, 0.4, 0.5])
+    index = Mock()
+    index.upsert = Mock(return_value=None)
+    text_key = "testing"
+    vectorstore = PineconeVectorStore(index, embedding, text_key)
+    texts = ["alpha", "beta", "gamma", "delta", "epsilon"]
+    id_prefix = "testing_prefixes"
+    vectorstore.add_texts(texts, id_prefix=id_prefix, async_req=False)
