diff --git a/.env.example b/.env.example index 778ad5519..2d17e9e12 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,3 @@ OPENAI_API_KEY= -# Update these with your Supabase details from your project settings > API and dashboard settings -PINECONE_API_KEY= -PINECONE_ENVIRONMENT= -PINECONE_INDEX_NAME= +COLLECTION_NAME= diff --git a/README.md b/README.md index 5696be598..d556a40a8 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,10 @@ -# GPT-4 & LangChain - Create a ChatGPT Chatbot for Your PDF Files +# GPT-4, LangChain & Chroma - Create a ChatGPT Chatbot for Your PDF Files -Use the new GPT-4 api to build a chatGPT chatbot for multiple Large PDF files. - -Tech stack used includes LangChain, Pinecone, Typescript, Openai, and Next.js. LangChain is a framework that makes it easier to build scalable AI/LLM apps and chatbots. Pinecone is a vectorstore for storing embeddings and your PDF in text to later retrieve similar docs. +**NOTE: The logic in the codebase is mostly outdated. To see the latest version of the AI PDF chatbot look at the main branch [here](https://github.com/mayooear/ai-pdf-chatbot-langchain)** -[Tutorial video](https://www.youtube.com/watch?v=ih9PBGVVOO4) +Use the new GPT-4 api to build a chatGPT chatbot for multiple Large PDF files. -[Join the discord if you have questions](https://discord.gg/E4Mc77qwjm) +Tech stack used includes LangChain, Chroma, Typescript, Openai, and Next.js. LangChain is a framework that makes it easier to build scalable AI/LLM apps and chatbots. Chroma is a vectorstore for storing embeddings and your PDF in text to later retrieve similar docs. The visual guide of this repo and tutorial is in the `visual guide` folder. @@ -16,14 +14,15 @@ Prelude: Please make sure you have already downloaded node on your system and th ## Development -1. Clone the repo or download the ZIP +1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) for your platform. + +2. 
Clone the repo or download the ZIP ``` git clone [github https url] ``` - -2. Install packages +3. Install packages First run `npm install yarn -g` to install yarn globally (if you haven't already). @@ -32,30 +31,32 @@ Then run: ``` yarn install ``` + After installation, you should now see a `node_modules` folder. -3. Set up your `.env` file +4. Set up your `.env` file - Copy `.env.example` into `.env` Your `.env` file should look like this: ``` OPENAI_API_KEY= - -PINECONE_API_KEY= -PINECONE_ENVIRONMENT= - -PINECONE_INDEX_NAME= +COLLECTION_NAME= ``` - Visit [openai](https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key) to retrieve API keys and insert into your `.env` file. -- Visit [pinecone](https://pinecone.io/) to create and retrieve your API keys, and also retrieve your environment and index name from the dashboard. - -4. In the `config` folder, replace the `PINECONE_NAME_SPACE` with a `namespace` where you'd like to store your embeddings on Pinecone when you run `npm run ingest`. This namespace will later be used for queries and retrieval. +- Choose a collection name where you'd like to store your embeddings in Chroma. This collection will later be used for queries and retrieval. +- [Chroma details](https://docs.trychroma.com/getting-started) 5. In `utils/makechain.ts` chain change the `QA_PROMPT` for your own usecase. Change `modelName` in `new OpenAI` to `gpt-4`, if you have access to `gpt-4` api. Please verify outside this repo that you have access to `gpt-4` api, otherwise the application will not work. +6. In a new terminal window, run Chroma in the Docker container: + +``` +docker run -p 8000:8000 ghcr.io/chroma-core/chroma:0.3.21 +``` + ## Convert your PDF files to embeddings **This repo can load multiple PDF files** @@ -64,11 +65,9 @@ PINECONE_INDEX_NAME= 2. Run the script `npm run ingest` to 'ingest' and embed your docs. If you run into errors troubleshoot below. -3. 
Check Pinecone dashboard to verify your namespace and vectors have been added. - ## Run the app -Once you've verified that the embeddings and content have been successfully added to your Pinecone, you can run the app `npm run dev` to launch the local dev environment, and then type a question in the chat interface. +Once you've verified that the embeddings and content have been successfully added to Chroma db, you can run the app `npm run dev` to launch the local dev environment, and then type a question in the chat interface. ## Troubleshooting @@ -79,21 +78,8 @@ In general, keep an eye out in the `issues` and `discussions` section of this re - Make sure you're running the latest Node version. Run `node -v` - Try a different PDF or convert your PDF to text first. It's possible your PDF is corrupted, scanned, or requires OCR to convert to text. - `Console.log` the `env` variables and make sure they are exposed. -- Make sure you're using the same versions of LangChain and Pinecone as this repo. - Check that you've created an `.env` file that contains your valid (and working) API keys, environment and index name. - If you change `modelName` in `OpenAI`, make sure you have access to the api for the appropriate model. - Make sure you have enough OpenAI credits and a valid card on your billings account. - Check that you don't have multiple OPENAPI keys in your global environment. If you do, the local `env` file from the project will be overwritten by systems `env` variable. - Try to hard code your API keys into the `process.env` variables if there are still issues. - -**Pinecone errors** - -- Make sure your pinecone dashboard `environment` and `index` matches the one in the `pinecone.ts` and `.env` files. -- Check that you've set the vector dimensions to `1536`. -- Make sure your pinecone namespace is in lowercase. -- Pinecone indexes of users on the Starter(free) plan are deleted after 7 days of inactivity. 
To prevent this, send an API request to Pinecone to reset the counter before 7 days. -- Retry from scratch with a new Pinecone project, index, and cloned repo. - -## Credit - -Frontend of this repo is inspired by [langchain-chat-nextjs](https://github.com/zahidkhawaja/langchain-chat-nextjs) diff --git a/config/chroma.ts b/config/chroma.ts new file mode 100644 index 000000000..31a247a80 --- /dev/null +++ b/config/chroma.ts @@ -0,0 +1,7 @@ +if (!process.env.COLLECTION_NAME) { +  throw new Error('Missing collection name in .env file'); +} + +const COLLECTION_NAME = process.env.COLLECTION_NAME ?? ''; + +export { COLLECTION_NAME }; diff --git a/config/pinecone.ts b/config/pinecone.ts deleted file mode 100644 index ce2dadaad..000000000 --- a/config/pinecone.ts +++ /dev/null @@ -1,13 +0,0 @@ -/** - * Change the namespace to the namespace on Pinecone you'd like to store your embeddings. - */ - -if (!process.env.PINECONE_INDEX_NAME) { -  throw new Error('Missing Pinecone index name in .env file'); -} - -const PINECONE_INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? 
''; - -const PINECONE_NAME_SPACE = 'pdf-test'; //namespace is optional for your vectors - -export { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE }; diff --git a/package.json b/package.json index 82579df5b..2b46de325 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gpt4-langchain-pdf-chatbot", - "version": "0.1.0", + "version": "0.2.0", "private": true, "license": "MIT", "author": "Mayooear", @@ -18,6 +18,7 @@ "@microsoft/fetch-event-source": "^2.0.1", "@pinecone-database/pinecone": "0.0.12", "@radix-ui/react-accordion": "^1.1.1", + "chromadb": "1.4.1", "clsx": "^1.2.1", "dotenv": "^16.0.3", "langchain": "0.0.55", @@ -49,7 +50,7 @@ "keywords": [ "starter", "gpt4", - "pinecone", + "chroma", "typescript", "nextjs", "langchain", diff --git a/pages/api/chat.ts b/pages/api/chat.ts index b9f41f54d..90843d82e 100644 --- a/pages/api/chat.ts +++ b/pages/api/chat.ts @@ -1,9 +1,8 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { OpenAIEmbeddings } from 'langchain/embeddings/openai'; -import { PineconeStore } from 'langchain/vectorstores/pinecone'; import { makeChain } from '@/utils/makechain'; -import { pinecone } from '@/utils/pinecone-client'; -import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone'; +import { COLLECTION_NAME } from '@/config/chroma'; +import { Chroma } from 'langchain/vectorstores/chroma'; export default async function handler( req: NextApiRequest, @@ -26,15 +25,11 @@ export default async function handler( const sanitizedQuestion = question.trim().replaceAll('\n', ' '); try { - const index = pinecone.Index(PINECONE_INDEX_NAME); - /* create vectorstore*/ - const vectorStore = await PineconeStore.fromExistingIndex( + const vectorStore = await Chroma.fromExistingCollection( new OpenAIEmbeddings({}), { - pineconeIndex: index, - textKey: 'text', - namespace: PINECONE_NAME_SPACE, //namespace comes from your config folder + collectionName: COLLECTION_NAME, }, ); diff --git a/pages/index.tsx 
b/pages/index.tsx index c80830751..b31f41626 100644 --- a/pages/index.tsx +++ b/pages/index.tsx @@ -262,7 +262,8 @@ export default function Home() { diff --git a/scripts/ingest-data.ts b/scripts/ingest-data.ts index f1e817982..ceb2d68e2 100644 --- a/scripts/ingest-data.ts +++ b/scripts/ingest-data.ts @@ -1,10 +1,9 @@ import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { OpenAIEmbeddings } from 'langchain/embeddings/openai'; -import { PineconeStore } from 'langchain/vectorstores/pinecone'; -import { pinecone } from '@/utils/pinecone-client'; import { CustomPDFLoader } from '@/utils/customPDFLoader'; -import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone'; import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'; +import { Chroma } from 'langchain/vectorstores/chroma'; +import { COLLECTION_NAME } from '@/config/chroma'; /* Name of directory to retrieve your files from */ const filePath = 'docs'; @@ -31,14 +30,18 @@ export const run = async () => { console.log('creating vector store...'); /*create and store the embeddings in the vectorStore*/ const embeddings = new OpenAIEmbeddings(); - const index = pinecone.Index(PINECONE_INDEX_NAME); //change to your own index name - //embed the PDF documents - await PineconeStore.fromDocuments(docs, embeddings, { - pineconeIndex: index, - namespace: PINECONE_NAME_SPACE, - textKey: 'text', - }); + let chroma = new Chroma(embeddings, { collectionName: COLLECTION_NAME }); + await chroma.index?.reset(); + + // Ingest documents in batches of 100 + + for (let i = 0; i < docs.length; i += 100) { + const batch = docs.slice(i, i + 100); + await Chroma.fromDocuments(batch, embeddings, { + collectionName: COLLECTION_NAME, + }); + } } catch (error) { console.log('error', error); throw new Error('Failed to ingest your data'); diff --git a/utils/makechain.ts b/utils/makechain.ts index 45f6f1dff..1722e5646 100644 --- a/utils/makechain.ts +++ b/utils/makechain.ts @@ -1,6 +1,6 @@ 
import { OpenAI } from 'langchain/llms/openai'; -import { PineconeStore } from 'langchain/vectorstores/pinecone'; import { ConversationalRetrievalQAChain } from 'langchain/chains'; +import { Chroma } from 'langchain/vectorstores/chroma'; const CONDENSE_PROMPT = `Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. @@ -18,7 +18,7 @@ If the question is not related to the context, politely respond that you are tun Question: {question} Helpful answer in markdown:`; -export const makeChain = (vectorstore: PineconeStore) => { +export const makeChain = (vectorstore: Chroma) => { const model = new OpenAI({ temperature: 0, // increase temepreature to get more creative answers modelName: 'gpt-3.5-turbo', //change this to gpt-4 if you have access diff --git a/utils/pinecone-client.ts b/utils/pinecone-client.ts deleted file mode 100644 index c84990034..000000000 --- a/utils/pinecone-client.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { PineconeClient } from '@pinecone-database/pinecone'; - -if (!process.env.PINECONE_ENVIRONMENT || !process.env.PINECONE_API_KEY) { - throw new Error('Pinecone environment or api key vars missing'); -} - -async function initPinecone() { - try { - const pinecone = new PineconeClient(); - - await pinecone.init({ - environment: process.env.PINECONE_ENVIRONMENT ?? '', //this is in the dashboard - apiKey: process.env.PINECONE_API_KEY ?? 
'', - }); - - return pinecone; - } catch (error) { - console.log('error', error); - throw new Error('Failed to initialize Pinecone Client'); - } -} - -export const pinecone = await initPinecone(); diff --git a/yarn.lock b/yarn.lock index 9c300bbae..61578025d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -881,6 +881,13 @@ chokidar@^3.5.3: optionalDependencies: fsevents "~2.3.2" +chromadb@1.4.1: + version "1.4.1" + resolved "https://registry.yarnpkg.com/chromadb/-/chromadb-1.4.1.tgz#a81a826956051617fdd25299fc5d3132bcb9ebd6" + integrity sha512-vRcig4CJxJXs++cKMt9tHmk9YjQprxzLK9sVYD6iXfqRJBoXeoFzk/RS95Dz1J6/7aSfBwDsyx3AE2ePP9FnYA== + dependencies: + axios "^0.26.0" + client-only@0.0.1: version "0.0.1" resolved "https://registry.yarnpkg.com/client-only/-/client-only-0.0.1.tgz#38bba5d403c41ab150bff64a95c85013cf73bca1"