-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(api): vector store endpoints (#468)
* Adds Vector Stores routes to API * Adds Vector Stores migration for database * Adds query/indexing operations that use Langchain to process files * Adds mimetype checking to Files endpoint, rejecting all unsupported filetypes from upload. * Adds a new leapfrogai/rag route for obtaining rag results before the LLM has processed them. * Adds some integration tests for Vector Stores, although Embeddings is mocked instead of required to be run - future PR should split these out into integration/unit tests. * Supports auth implementation on all new endpoints
- Loading branch information
Showing
34 changed files
with
1,721 additions
and
158 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
"""Load a file and split it into chunks.""" | ||
|
||
# This import is required for "magic" to work, see https://github.com/ahupp/python-magic/issues/233 | ||
# may not be needed after https://github.com/ahupp/python-magic/pull/294 is merged | ||
import pylibmagic # noqa: F401 # pylint: disable=unused-import | ||
import magic | ||
from langchain_community.document_loaders import ( | ||
CSVLoader, | ||
Docx2txtLoader, | ||
PyPDFLoader, | ||
TextLoader, | ||
UnstructuredHTMLLoader, | ||
UnstructuredMarkdownLoader, | ||
) | ||
from langchain_core.documents import Document | ||
from langchain_text_splitters import RecursiveCharacterTextSplitter | ||
|
||
HANDLERS = { | ||
"application/pdf": PyPDFLoader, | ||
"text/plain": TextLoader, | ||
"text/html": UnstructuredHTMLLoader, | ||
"text/csv": CSVLoader, | ||
"text/markdown": UnstructuredMarkdownLoader, | ||
"application/msword": Docx2txtLoader, | ||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": Docx2txtLoader, | ||
} | ||
|
||
|
||
def is_supported_mime_type(mime_type: str) -> bool: | ||
"""Validate the mime type of a file.""" | ||
return mime_type in HANDLERS | ||
|
||
|
||
async def load_file(file_path: str) -> list[Document]: | ||
"""Load a file and return a list of documents.""" | ||
|
||
mime_type = magic.from_file(file_path, mime=True) | ||
|
||
loader = HANDLERS.get(mime_type) | ||
|
||
if loader: | ||
return await loader(file_path).aload() | ||
raise ValueError(f"Unsupported file type: {mime_type}") | ||
|
||
|
||
async def split(docs: list[Document]) -> list[Document]: | ||
"""Split a document into chunks.""" | ||
separators = [ | ||
"\n\n", | ||
"\n", | ||
" ", | ||
".", | ||
",", | ||
"\u200b", # Zero-width space | ||
"\uff0c", # Full width comma | ||
"\u3001", # Ideographic comma | ||
"\uff0e", # Full width full stop | ||
"\u3002", # Ideographic full stop | ||
"", | ||
] | ||
|
||
text_splitter = RecursiveCharacterTextSplitter( | ||
# TODO: This parameters might need to be tuned and/or exposed for configuration | ||
chunk_size=500, | ||
chunk_overlap=50, | ||
length_function=len, | ||
is_separator_regex=False, | ||
separators=separators, | ||
) | ||
|
||
return await text_splitter.atransform_documents(docs) |
Oops, something went wrong.