# summarizer_test_macos.py
# Small retrieval-augmented question-answering script (macOS setup).
import os

# macOS specific: TESSDATA_PREFIX is pointed at the tessdata folder of the
# local "llm-conda" Miniconda environment (presumably consumed by the OCR
# step of the PDF loading further down -- confirm on your machine).
os.environ.update(
    {'TESSDATA_PREFIX': '/opt/miniconda3/envs/llm-conda/share/tessdata'}
)

# LangSmith tracing switches, currently left commented out.
#os.environ['LANGSMITH_TRACING'] = "false"
#os.environ['LANGSMITH_TRACING_V2'] = "false"
from langchain_community.document_loaders import DirectoryLoader

# When True, the source PDFs are re-read and the vector store is rebuilt;
# when False, the loading step below is skipped entirely.
REBUILD = True

if REBUILD:
    # Match every PDF below the benchmark folder, including sub-folders.
    loader = DirectoryLoader("../../benchmark/ott", glob="**/*.pdf")
    books = loader.load()
    # Report how many documents the loader returned.
    print('Number of books:', len(books))
from langchain_text_splitters import RecursiveCharacterTextSplitter

if REBUILD:
    # Chunks of up to 1000 characters, with no overlap between neighbours.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
    )
    # Only the chunks at positions 9..12 are kept for indexing.
    all_splits = text_splitter.split_documents(books)[9:13]
    # Dump the retained chunks so the indexed text can be inspected.
    for chunk in all_splits:
        print(chunk.page_content)
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# The same embedding model is used whether the store is rebuilt or reopened.
embeddings = OllamaEmbeddings(model="llama3")

if REBUILD:
    # Embed the retained chunks into a store backed by ./chroma_db.
    # NOTE(review): presumably this adds to whatever is already stored in
    # that directory, so repeated rebuilds may accumulate duplicates -- confirm.
    vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=embeddings,
        persist_directory="./chroma_db",
    )
else:
    # Reopen the store kept under the same directory instead of re-embedding.
    vectorstore = Chroma(
        persist_directory='./chroma_db',
        embedding_function=embeddings,
    )
from langchain_ollama.llms import OllamaLLM
# Module-level language model ("llama3" served through Ollama); it is the
# object generate_response() calls to turn a prompt into an answer.
llm = OllamaLLM(model="llama3")
def generate_response(query, retrieved_docs):
    """Ask the module-level ``llm`` to answer *query* from retrieved text.

    Args:
        query: The question to answer.
        retrieved_docs: Iterable of objects exposing a ``page_content``
            string; their texts are joined with single spaces to form the
            context section of the prompt.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    passages = [doc.page_content for doc in retrieved_docs]
    context = " ".join(passages)
    # Context first, then the question, separated by a blank line.
    prompt = f"Based on the following information: {context}\n\nAnswer the query: {query}"
    return llm.invoke(prompt)
def rag_system(query, k=3):
    """Answer *query* by retrieving similar chunks and prompting the LLM.

    Args:
        query: The question to answer.
        k: Number of documents requested from the vector store's
            similarity search. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The result of ``generate_response`` for the retrieved documents.
    """
    # Retrieve the k stored chunks most similar to the query.
    retrieved_docs = vectorstore.similarity_search(query, k=k)
    # Let the language model answer using only those chunks as context.
    return generate_response(query, retrieved_docs)
# Question sent through the RAG pipeline; only this assignment is active.
question = "What is the royalty income in FY2023"
# Alternative questions: uncomment one to use it instead (a later assignment
# overrides the one above).
#question = "What is the office of Technology transfer"
#question = "In FY2023, how many licensed products provided royalty income back to the NIH"
#question = "Where was royalty income mentioned in the document"
#question = "Where is OTT mentioned in the document?"
#question = "What did NCATS do? Show me the numbers of inventions and patents"
#question = "In the first paragraph, the document mentioned about royal income. How much was the dollar amount of the royal income?"
#question = "how many licensed products developed from NIH, what are their names"
# Echo the question, run it through rag_system(), then print the answer.
print(question)
answers = rag_system(question)
print(answers)