app.py
import os
import sys

import streamlit as st
import fitz  # PyMuPDF, used for PDF text extraction
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter

# gemini embeddings and chat model
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

from logger import logging
from exception import customexception

# loading environment variables
load_dotenv(find_dotenv())
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
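# NOTE: GOOGLE_API_KEY is expected in a .env file or the shell environment;
# if it is missing, os.getenv() returns None and Gemini requests will fail to
# authenticate.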
def get_pdf_text(pdf_docs):
    """
    Extracts the text content from an uploaded PDF document.

    This function takes an uploaded PDF file object (such as a Streamlit
    `UploadedFile`), reads its bytes, and extracts the text from every page
    using PyMuPDF. The extracted text is concatenated into a single string
    and returned.

    Args:
        pdf_docs: A file-like object exposing `.read()` that yields the raw
            bytes of the PDF from which text needs to be extracted.

    Returns:
        str: A string containing the concatenated text extracted from all
            the pages of the provided PDF document.

    Raises:
        customexception: If any exception occurs during the text extraction
            process, the exception is logged and raised as
            `customexception`.

    Example:
        with open("document1.pdf", "rb") as fh:
            text = get_pdf_text(fh)
        print(text)

    Notes:
        - This function uses PyMuPDF (`fitz`) to open the PDF from an
          in-memory byte stream and extract text page by page.
        - The function logs the start and completion of the text extraction
          process, and it also logs any exceptions that occur during the
          process.
    """
    try:
        logging.info("[-] Text extraction started...")
        text = ""
        # open the uploaded file's bytes as an in-memory PDF
        docs = fitz.open(stream=pdf_docs.read(), filetype="pdf")
        for page in docs:
            text += page.get_text("text") + "\n"
        docs.close()
        logging.info("[*] Text extraction completed!")
        return text
    except Exception as e:
        logging.info(f"[!] Exception while getting text from PDF : {e}")
        raise customexception(e, sys)
def get_text_chunks(text):
"""
Splits a large text into smaller, manageable chunks.
This function takes a long text string and splits it into chunks of
a specified size using a recursive character-based text splitter.
The chunks are designed to have a maximum size of `chunk_size`
with a possible overlap (`chunk_overlap`) between consecutive chunks.
Args:
text (str): The large text string that needs to be split into chunks.
Returns:
list: A list of text chunks (strings), where each chunk is a substring
of the original text. The chunks have a maximum size of `chunk_size`,
with a `chunk_overlap` between adjacent chunks.
Raises:
customexception: If any error occurs during the text chunking process,
an exception is logged and raised as `customexception`.
Example:
large_text = "A very long text that needs to be chunked ..."
text_chunks = get_text_chunks(large_text)
print(text_chunks)
    Notes:
        - This function uses `RecursiveCharacterTextSplitter` to split the input text.
        - `chunk_size` is hardcoded to 10,000 characters and `chunk_overlap` to 1,000
          characters.
- The function logs the start and completion of the chunking process,
and it also logs any exceptions that occur.
"""
try:
logging.info(f"[-] Text chunking started...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
chunks = text_splitter.split_text(text)
logging.info("[*] Text chunking completed!")
return chunks
except Exception as e:
logging.info(f"[!] Exception while converting text to chunks : {e}")
raise customexception(e, sys)
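# A minimal sketch of the splitter's overlap behavior (illustrative values, not
# the ones used above): with chunk_size=10 and chunk_overlap=4, a string with no
# separators such as "abcdefghijklmn" splits roughly into
# ["abcdefghij", "ghijklmn"], so the tail of each chunk is repeated at the head
# of the next.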
def get_vector_store(text_chunks):
"""
Converts text chunks into a vector store using embeddings and saves it locally.
This function takes a list of text chunks, converts each chunk into a vector
using a pre-trained embedding model (Google Generative AI Embeddings), and
stores the resulting vectors in a FAISS index. The index is then saved to the
local file system for future use.
Args:
text_chunks (list): A list of text strings (chunks) to be converted into
vector embeddings.
Returns:
None: This function does not return any value. Instead, it saves the
resulting FAISS index to disk.
Raises:
customexception: If any error occurs during the embedding or vector store
creation process, an exception is logged and raised as
`customexception`.
Example:
text_chunks = ["This is the first chunk of text.", "This is the second chunk."]
get_vector_store(text_chunks) # This saves the FAISS index locally as "faiss_index"
Notes:
- This function uses `GoogleGenerativeAIEmbeddings` to generate embeddings for each text chunk.
        - It uses FAISS to create a vector store and saves the index to a local folder named "faiss_index".
- The function logs the start and completion of the embedding conversion process,
and also logs any exceptions that occur during the process.
"""
try:
logging.info("[-] Conversion to embedding started...")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
vector_store.save_local("faiss_index")
logging.info("[*] Conversion to embedding completed!")
except Exception as e:
logging.info(f"[!] Exception while converting to vector : {e}")
raise customexception(e, sys)
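# save_local("faiss_index") writes the index into a local "faiss_index"
# directory (typically an index.faiss file plus a pickled docstore), which
# user_input() reloads below via FAISS.load_local().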
def get_conversational_chain():
"""
Creates and returns a conversational chain for question-answering using a large language model (LLM).
This function initializes a conversational chain designed to answer questions based on a provided context.
It sets up a prompt template for the question-answering model, uses a pre-trained Google Generative AI model
(specified by the "gemini-pro" model), and configures the chain to process context and questions.
The function returns the initialized chain that can be used for answering questions with the provided context.
Returns:
Chain: A conversational chain (`load_qa_chain`) configured with a prompt template and a model, ready to
answer questions based on the provided context.
Raises:
customexception: If any error occurs while creating or setting up the conversational chain, an exception is
logged and raised as `customexception`.
    Example:
        # `docs` are Documents retrieved from a vector-store similarity search
        chain = get_conversational_chain()
        result = chain({"input_documents": docs, "question": "Your question here"},
                       return_only_outputs=True)
        print(result["output_text"])
Notes:
- The function uses the `ChatGoogleGenerativeAI` model with `gemini-pro` as the LLM for answering the questions.
- The prompt template is designed to instruct the model to provide detailed answers based on the context provided.
- The function logs the start and completion of setting up the conversation chain, and it also logs any exceptions
that occur during the setup.
"""
try:
logging.info("[-] Conversation chain started...")
prompt_template= """
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context", don't provide the wrong answers\n\n
Context:\n {context}?\n
Question:\n {question}\n
Answer:
"""
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
prompt=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
logging.info("[*] Conversation chain completed!")
return chain
except Exception as e:
logging.info(f"[!] Exception while conversing with llm : {e}")
raise customexception(e, sys)
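# Illustrative usage (`docs` would come from a vector-store similarity search,
# as in user_input() below):
#   chain = get_conversational_chain()
#   response = chain({"input_documents": docs, "question": "..."},
#                    return_only_outputs=True)
#   print(response["output_text"])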
def user_input(user_question):
"""
Processes user input by performing a similarity search on a pre-trained FAISS index
and generates a response using a conversational chain.
This function takes a user’s question, performs a similarity search on a locally saved
FAISS index to retrieve relevant context, and then uses a conversational chain powered
by a language model to generate a detailed answer. The response is then printed and
displayed using Streamlit (`st.write`).
Args:
user_question (str): The question input by the user that will be used to
perform a similarity search and generate a response.
Returns:
None: The function does not return a value, but it prints the response to
the console and displays it on the web interface using Streamlit.
Raises:
customexception: If any error occurs during loading the FAISS index, performing
the similarity search, or generating the response, an exception
is logged and raised as `customexception`.
Example:
user_question = "What is the capital of France?"
user_input(user_question)
Notes:
- The function loads the FAISS index stored locally as "faiss_index" and uses the
`GoogleGenerativeAIEmbeddings` model to perform the similarity search.
- It then passes the relevant documents retrieved from the search along with the user’s
question to a conversational chain for response generation.
- The final response is printed and displayed using Streamlit's `st.write` function.
"""
try:
logging.info("[-] Processing user input...")
        # load Google's embedding model
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        # load the FAISS index from local disk
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
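        # allow_dangerous_deserialization=True is needed because the saved
        # docstore is a pickle; only enable it for indexes you created yourself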
# then performing similarity search
docs = new_db.similarity_search(user_question)
chain = get_conversational_chain()
response = chain(
{"input_documents": docs,
"question": user_question},
return_only_outputs=True
)
print(response)
st.write("Reply: ", response["output_text"])
logging.info("[*] Processing use input completed!")
except Exception as e:
logging.info(f"[!] Exception while processing user input : {e}")
raise customexception(e, sys)
def main():
"""
Streamlit app for chatting with PDF documents using a language model.
    This function sets up a Streamlit web application where users can upload a
    PDF file, ask questions about its contents, and receive answers generated
    by a language model. The app extracts the text from the uploaded PDF,
    converts it into embeddings, and uses a conversational chain to respond to
    user queries.

    Functionality:
        - Sets up the main page with a header and a text input for user questions.
        - Allows the user to upload a PDF file from the sidebar.
        - Once a PDF is uploaded, its text is extracted, chunked, and embedded into a vector store.
        - Users can submit questions, which are answered based on the processed PDF content.

    Workflow:
        1. The user uploads a PDF file and submits it for processing.
        2. Text is extracted from the PDF, split into chunks, and stored in a vector store.
        3. The user asks a question, and the system retrieves relevant context from the PDF.
        4. The question is answered using a conversational chain powered by a language model.
Notes:
- The app uses Streamlit’s UI components like `file_uploader`, `text_input`, and `button` for interaction.
- The backend processes include extracting text from PDFs, chunking it, creating embeddings, and storing them in FAISS.
"""
try:
# streamlit app
st.set_page_config("Chat PDF")
# header for front end
st.header("Chat with PDF Using Gemini")
        # the user provides a question
        user_question = st.text_input("Ask a Question from the PDF Files")

        # answer once a question has been entered
        if user_question:
            user_input(user_question)
# side bar to upload pdf
with st.sidebar:
# title of side bar
st.title("Menu:")
            # PDF uploader widget
            pdf_docs = st.file_uploader("Upload your PDF File and Click on the Submit & Process Button")

            # submit button to process the uploaded PDF
            if st.button("Submit & Process"):
                if pdf_docs is None:
                    st.warning("Please upload a PDF file first.")
                else:
                    # spinner shown while the pipeline runs
                    with st.spinner("Processing..."):
                        # extraction -> chunking -> embedding; each step logs for debugging
                        raw_text = get_pdf_text(pdf_docs)
                        text_chunks = get_text_chunks(raw_text)
                        get_vector_store(text_chunks)
                    # success message shown once all steps complete
                    st.success("Done")
except Exception as e:
logging.info(f"[!] Exception while loading frontend : {e}")
raise customexception(e, sys)
if __name__ == "__main__":
main()
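# To launch the app locally:
#   streamlit run app.py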