forked from pdichone/rag-intro-chat-with-docs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
150 lines (120 loc) · 4.66 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions
# --- One-time setup: environment, embedding function, vector store, LLM client ---

# Load environment variables from .env file (expects OPENAI_API_KEY).
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")  # NOTE(review): None if unset — downstream API calls will fail; confirm .env is present

# Embedding function Chroma uses to embed documents and queries via OpenAI.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key, model_name="text-embedding-3-small"
)

# Initialize the Chroma client with persistence — vectors are stored on disk
# under ./chroma_persistent_storage and survive restarts.
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function=openai_ef
)

# OpenAI client used directly for chat completions and ad-hoc embeddings.
client = OpenAI(api_key=openai_key)
# Function to load documents from a directory
def load_documents_from_directory(directory_path):
    """Load every ``.txt`` file in *directory_path* (non-recursive).

    Args:
        directory_path: Directory to scan for ``.txt`` files.

    Returns:
        list[dict]: One ``{"id": filename, "text": file contents}`` per file,
        in sorted filename order.

    Raises:
        FileNotFoundError: If *directory_path* does not exist.
    """
    print("==== Loading documents from directory ====")
    documents = []
    # sorted() makes document (and therefore chunk-ID) order deterministic;
    # os.listdir returns entries in arbitrary, OS-dependent order.
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                documents.append({"id": filename, "text": file.read()})
    return documents
# Function to split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=20):
    """Split *text* into chunks of at most *chunk_size* characters.

    Consecutive chunks share *chunk_overlap* characters so that sentences
    spanning a boundary are not lost to retrieval.

    Args:
        text: The string to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        list[str]: Chunks covering all of *text*; ``[]`` for an empty string.

    Raises:
        ValueError: If ``chunk_overlap >= chunk_size`` (the window would
        never advance and the loop would run forever).
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # The text is exhausted. Stepping back by the overlap here (the
            # original behavior) would emit one extra trailing chunk made
            # entirely of already-seen overlap characters.
            break
        start = end - chunk_overlap
    return chunks
# --- Ingest: load the source articles and split each one into chunks ---
directory_path = "./news_articles"
documents = load_documents_from_directory(directory_path)
print(f"Loaded {len(documents)} documents")

# Each chunk gets a stable ID of the form "<source filename>_chunk<1-based index>".
chunked_documents = []
for doc in documents:
    pieces = split_text(doc["text"])
    print("==== Splitting docs into chunks ====")
    for i, chunk in enumerate(pieces):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})
# Function to generate embeddings using OpenAI API
def get_openai_embedding(text):
    """Embed *text* with OpenAI's ``text-embedding-3-small`` model.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector (list of floats) for *text*.
    """
    result = client.embeddings.create(input=text, model="text-embedding-3-small")
    vector = result.data[0].embedding
    print("==== Generating embeddings... ====")
    return vector
# --- Index: embed every chunk, then upsert each into the Chroma collection ---
for chunk in chunked_documents:
    print("==== Generating embeddings... ====")
    chunk["embedding"] = get_openai_embedding(chunk["text"])

# Upsert (insert-or-update) keyed on the chunk ID, so re-running the script
# refreshes existing entries instead of duplicating them.
for chunk in chunked_documents:
    print("==== Inserting chunks into db;;; ====")
    collection.upsert(
        ids=[chunk["id"]],
        documents=[chunk["text"]],
        embeddings=[chunk["embedding"]],
    )
# Function to query documents
def query_documents(question, n_results=2):
    """Retrieve the chunks most similar to *question* from the collection.

    Args:
        question: Natural-language query text.
        n_results: Number of chunks to retrieve.

    Returns:
        list[str]: The retrieved chunk texts, flattened into one list.
    """
    # Chroma's query API takes a *list* of query texts; wrap the single
    # question rather than passing the bare string.
    results = collection.query(query_texts=[question], n_results=n_results)
    # results["documents"] is one list of chunks per query text — flatten it.
    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    print("==== Returning relevant chunks ====")
    return relevant_chunks
# Function to generate a response from OpenAI
def generate_response(question, relevant_chunks):
    """Answer *question* with gpt-3.5-turbo, grounded in *relevant_chunks*.

    Args:
        question: The user's question.
        relevant_chunks: Retrieved context passages to ground the answer.

    Returns:
        str: The model's answer text.
    """
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the answer concise."
        "\n\nContext:\n" + context + "\n\nQuestion:\n" + question
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
            {
                "role": "user",
                "content": question,
            },
        ],
    )
    # Bug fix: .message is a ChatCompletionMessage object — printing it shows
    # the object repr. The answer text lives in .content.
    answer = response.choices[0].message.content
    return answer
# --- Demo: run one retrieval-augmented query end to end ---
question = "tell me about databricks"
relevant_chunks = query_documents(question)
answer = generate_response(question, relevant_chunks)
print(answer)