-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathingest.py
32 lines (27 loc) · 1.16 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# Extensions this script knows how to load; must stay in sync with the
# loader table inside process_documents().
_SUPPORTED_EXTENSIONS = (".pdf", ".txt")


def _find_supported_files(source_dir, extensions):
    """Return paths under *source_dir* whose lower-cased extension is in *extensions*."""
    matches = []
    for root, dirs, files in os.walk(source_dir):
        # Sorting dirs in place makes os.walk descend in a stable order, so
        # documents are ingested in the same order on every run/filesystem.
        dirs.sort()
        for name in sorted(files):
            if os.path.splitext(name)[-1].lower() in extensions:
                matches.append(os.path.join(root, name))
    return matches


def process_documents(
    source_dir,
    embeddings_model_name,
    persist_directory="faiss_store",
    chunk_size=500,
    chunk_overlap=50,
):
    """Build a FAISS vector store from the documents under *source_dir*.

    The directory is walked recursively. ``.pdf`` files are loaded with
    ``PyPDFLoader`` and ``.txt`` files with ``TextLoader`` (extension match is
    case-insensitive); every other file is ignored. The loaded documents are
    split with ``RecursiveCharacterTextSplitter``, embedded with
    ``HuggingFaceEmbeddings`` and the resulting index is written to disk with
    ``FAISS.save_local``.

    Args:
        source_dir: Root directory to scan for documents.
        embeddings_model_name: Model name passed to ``HuggingFaceEmbeddings``.
        persist_directory: Folder the FAISS index is saved to.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Raises:
        FileNotFoundError: If *source_dir* is not an existing directory.
        ValueError: If no supported file is found, or if the loaded documents
            produce no chunks to index.
    """
    # Validate the input before touching any loader or model: os.walk()
    # silently yields nothing for a missing directory, which would otherwise
    # surface much later as an unrelated-looking failure inside FAISS.
    if not os.path.isdir(source_dir):
        raise FileNotFoundError(f"Source directory not found: {source_dir!r}")

    paths = _find_supported_files(source_dir, _SUPPORTED_EXTENSIONS)
    if not paths:
        raise ValueError(
            f"No supported documents {_SUPPORTED_EXTENSIONS} found under {source_dir!r}"
        )

    loaders = {
        ".pdf": PyPDFLoader,
        ".txt": TextLoader,
    }
    documents = []
    for path in paths:
        ext = os.path.splitext(path)[-1].lower()
        documents.extend(loaders[ext](path).load())

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # An index cannot be built from zero vectors; fail with a clear
        # message instead of letting the vector store constructor choke.
        raise ValueError(
            f"Documents under {source_dir!r} produced no text chunks to index"
        )

    # Create embeddings and FAISS vectorstore
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    vectorstore = FAISS.from_documents(chunks, embeddings)

    # Save the vectorstore to disk
    vectorstore.save_local(persist_directory)
if __name__ == "__main__":
    # Script entry point: index everything under ./source_documents using
    # the all-MiniLM-L6-v2 embedding model.
    process_documents(
        source_dir="source_documents",
        embeddings_model_name="all-MiniLM-L6-v2",
    )