From 9de3eeb194439d1371a36899cfefca6033d4d10b Mon Sep 17 00:00:00 2001
From: Hideyuki Kagami
Date: Sat, 28 Dec 2024 01:35:42 +0900
Subject: [PATCH 1/3] chore(deps): update graphrag to version 1.0

---
 README.md                                    |  2 +-
 libs/ktem/ktem/index/file/graph/pipelines.py | 34 ++++++--------------
 2 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 671c99f0..8df48d72 100644
--- a/README.md
+++ b/README.md
@@ -206,7 +206,7 @@ documents and developers who want to build their own RAG pipeline.
 - **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command:
 
   ```shell
-  pip install "graphrag<=0.3.6" future
+  pip install "graphrag<=1.0.1" future
   ```
 
 - **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file.

diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py
index 31b491b4..dd1ca1c7 100644
--- a/libs/ktem/ktem/index/file/graph/pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/pipelines.py
@@ -2,7 +2,6 @@
 import shutil
 import subprocess
 from pathlib import Path
-from shutil import rmtree
 from typing import Generator
 from uuid import uuid4
 
@@ -27,7 +26,6 @@
     read_indexer_reports,
     read_indexer_text_units,
 )
-from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
 from graphrag.query.llm.oai.embedding import OpenAIEmbedding
 from graphrag.query.llm.oai.typing import OpenaiApiType
 from graphrag.query.structured_search.local_search.mixed_context import (
@@ -115,25 +113,16 @@ def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
         input_path = str(input_path.absolute())
 
         # Construct the command
-        command = [
-            "python",
-            "-m",
-            "graphrag.index",
-            "--root",
-            input_path,
-            "--reporter",
-            "rich",
-            "--init",
-        ]
+        init_command = ["graphrag", "init", "--root", input_path]
+        index_command = ["graphrag", "index", "--root", input_path]
 
         # Run the command
         yield Document(
             channel="debug",
             text="[GraphRAG] Creating index... This can take a long time.",
         )
-        result = subprocess.run(command, capture_output=True, text=True)
+        result = subprocess.run(init_command, capture_output=True, text=True)
         print(result.stdout)
-        command = command[:-1]
 
         # copy customized GraphRAG config file if it exists
         if config("USE_CUSTOMIZED_GRAPHRAG_SETTING", default="value").lower() == "true":
@@ -146,7 +135,9 @@ def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
             print("failed to copy customized GraphRAG config file. ")
 
         # Run the command and stream stdout
-        with subprocess.Popen(command, stdout=subprocess.PIPE, text=True) as process:
+        with subprocess.Popen(
+            index_command, stdout=subprocess.PIPE, text=True
+        ) as process:
             if process.stdout:
                 for line in process.stdout:
                     yield Document(channel="debug", text=line)
@@ -227,14 +218,9 @@ def _build_graph_search(self):
         # load description embeddings to an in-memory lancedb vectorstore
         # to connect to a remote db, specify url and port values.
         description_embedding_store = LanceDBVectorStore(
-            collection_name="entity_description_embeddings",
+            collection_name="default-entity-description",
         )
         description_embedding_store.connect(db_uri=LANCEDB_URI)
-        if Path(LANCEDB_URI).is_dir():
-            rmtree(LANCEDB_URI)
-        _ = store_entity_semantic_embeddings(
-            entities=entities, vectorstore=description_embedding_store
-        )
         print(f"Entity count: {len(entity_df)}")
 
         # Read relationships
@@ -382,13 +368,13 @@ def run(
             # (if you are using a model with 8k limit, a good setting could be 5000)
         }
 
-        context_text, context_records = context_builder.build_context(
+        context_builder_result = context_builder.build_context(
             query=text,
             conversation_history=None,
             **local_context_params,
         )
-        documents = self.format_context_records(context_records)
-        plot = self.plot_graph(context_records)
+        documents = self.format_context_records(context_builder_result.context_records)
+        plot = self.plot_graph(context_builder_result.context_records)
 
         return documents + [
             RetrievedDocument(

From e5b58a8c64694d90991ea676a16deb79442e96ed Mon Sep 17 00:00:00 2001
From: Hideyuki Kagami
Date: Mon, 30 Dec 2024 15:51:44 +0900
Subject: [PATCH 2/3] fix: update settings.yaml

'embeddings.vector_store' setting is required from v1.0
https://github.com/microsoft/graphrag/blob/v1.0.1/graphrag/config/init_content.py
---
 settings.yaml.example | 135 ++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 78 deletions(-)

diff --git a/settings.yaml.example b/settings.yaml.example
index 7b0ca776..3598bb39 100644
--- a/settings.yaml.example
+++ b/settings.yaml.example
@@ -2,62 +2,53 @@
 # The parameters in this file will only take effect when the USE_CUSTOMIZED_GRAPHRAG_SETTING is true in .env file.
 # For a comprehensive understanding of GraphRAG parameters, please refer to: https://microsoft.github.io/graphrag/config/json_yaml/.
 
-encoding_model: cl100k_base
-skip_workflows: []
+### This config file contains required core defaults that must be set, along with a handful of common optional settings.
+### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/
+
+### LLM settings ###
+## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.
+
+encoding_model: cl100k_base # this needs to be matched to your model!
+
 llm:
-  api_key: ${GRAPHRAG_API_KEY}
+  api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file
   type: openai_chat # or azure_openai_chat
-  api_base: http://127.0.0.1:11434/v1
   model: qwen2
   model_supports_json: true # recommended if this is available for your model.
-  # max_tokens: 4000
-  request_timeout: 1800.0
+  # audience: "https://cognitiveservices.azure.com/.default"
   # api_base: https://<instance>.openai.azure.com
   # api_version: 2024-02-15-preview
   # organization: <organization_id>
   # deployment_name: <azure_model_deployment_name>
-  # tokens_per_minute: 150_000 # set a leaky bucket throttle
-  # requests_per_minute: 10_000 # set a leaky bucket throttle
-  # max_retries: 10
-  # max_retry_wait: 10.0
-  # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+  api_base: http://127.0.0.1:11434/v1
+  request_timeout: 1800.0
   concurrent_requests: 5 # the number of parallel inflight requests that may be made
-  # temperature: 0 # temperature for sampling
-  # top_p: 1 # top-p sampling
-  # n: 1 # Number of completions to generate
 
 parallelization:
   stagger: 0.3
-  # num_threads: 50 # the number of threads to use for parallel processing
+  # num_threads: 50
 
 async_mode: threaded # or asyncio
 
 embeddings:
-  ## parallelization: override the global parallelization settings for embeddings
   async_mode: threaded # or asyncio
-  # target: required # or all
-  # batch_size: 16 # the number of documents to send in a single request
-  # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
+  vector_store:
+    type: lancedb
+    db_uri: 'output/lancedb'
+    container_name: default
+    overwrite: true
   llm:
     api_base: http://localhost:11434/v1
     api_key: ${GRAPHRAG_API_KEY}
+    type: openai_embedding # or azure_openai_embedding
     model: nomic-embed-text
-    type: openai_embedding
     # api_base: https://<instance>.openai.azure.com
     # api_version: 2024-02-15-preview
+    # audience: "https://cognitiveservices.azure.com/.default"
     # organization: <organization_id>
     # deployment_name: <azure_model_deployment_name>
-    # tokens_per_minute: 150_000 # set a leaky bucket throttle
-    # requests_per_minute: 10_000 # set a leaky bucket throttle
-    # max_retries: 10
-    # max_retry_wait: 10.0
-    # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
-    # concurrent_requests: 25 # the number of parallel inflight requests that may be made
 
-chunks:
-  size: 1200
-  overlap: 100
-  group_by_columns: [id] # by default, we don't allow chunks to cross documents
+### Input settings ###
 
 input:
   type: file # or blob
@@ -66,54 +57,53 @@ input:
   file_encoding: utf-8
   file_pattern: ".*\\.txt$"
 
+chunks:
+  size: 1200
+  overlap: 100
+  group_by_columns: [id]
+
+### Storage settings ###
+## If blob storage is specified in the following four sections,
+## connection_string and container_name must be provided
 
 cache:
   type: file # or blob
   base_dir: "cache"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
+
+reporting:
+  type: file # or console, blob
+  base_dir: "logs"
 
 storage:
   type: file # or blob
   base_dir: "output"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
 
-reporting:
-  type: file # or console, blob
-  base_dir: "output"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
+## only turn this on if running `graphrag index` with custom settings
+## we normally use `graphrag update` with the defaults
+update_index_storage:
+  # type: file # or blob
+  # base_dir: "update_output"
+
+### Workflow settings ###
+
+skip_workflows: []
 
 entity_extraction:
-  ## strategy: fully override the entity extraction strategy.
-  ## type: one of graph_intelligence, graph_intelligence_json and nltk
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/entity_extraction.txt"
   entity_types: [organization,person,geo,event]
   max_gleanings: 1
 
 summarize_descriptions:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/summarize_descriptions.txt"
   max_length: 500
 
 claim_extraction:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
-  # enabled: true
+  enabled: false
   prompt: "prompts/claim_extraction.txt"
   description: "Any claims or facts that could be relevant to information discovery."
   max_gleanings: 1
 
 community_reports:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/community_report.txt"
   max_length: 2000
   max_input_length: 8000
@@ -123,37 +113,26 @@ cluster_graph:
 
 embed_graph:
   enabled: false # if true, will generate node2vec embeddings for nodes
-  # num_walks: 10
-  # walk_length: 40
-  # window_size: 2
-  # iterations: 3
-  # random_seed: 597832
 
 umap:
   enabled: false # if true, will generate UMAP embeddings for nodes
 
 snapshots:
   graphml: false
-  raw_entities: false
-  top_level_nodes: false
+  embeddings: false
+  transient: false
+
+### Query settings ###
+## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
+## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
 
 local_search:
-  # text_unit_prop: 0.5
-  # community_prop: 0.1
-  # conversation_history_max_turns: 5
-  # top_k_mapped_entities: 10
-  # top_k_relationships: 10
-  # llm_temperature: 0 # temperature for sampling
-  # llm_top_p: 1 # top-p sampling
-  # llm_n: 1 # Number of completions to generate
-  # max_tokens: 12000
+  prompt: "prompts/local_search_system_prompt.txt"
 
 global_search:
-  # llm_temperature: 0 # temperature for sampling
-  # llm_top_p: 1 # top-p sampling
-  # llm_n: 1 # Number of completions to generate
-  # max_tokens: 12000
-  # data_max_tokens: 12000
-  # map_max_tokens: 1000
-  # reduce_max_tokens: 2000
-  # concurrency: 32
+  map_prompt: "prompts/global_search_map_system_prompt.txt"
+  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
+  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"
+
+drift_search:
+  prompt: "prompts/drift_search_system_prompt.txt"

From 54f54ced02fdafaede4675ee2b7316508e9a3fa0 Mon Sep 17 00:00:00 2001
From: Hideyuki Kagami
Date: Mon, 30 Dec 2024 15:53:43 +0900
Subject: [PATCH 3/3] fix: update return type of context_records

https://github.com/microsoft/graphrag/blob/v1.0.1/graphrag/query/context_builder/builders.py#L21
---
 libs/ktem/ktem/index/file/graph/pipelines.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py
index dd1ca1c7..d1b20b49 100644
--- a/libs/ktem/ktem/index/file/graph/pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/pipelines.py
@@ -291,10 +291,10 @@ def _to_document(self, header: str, context_text: str) -> RetrievedDocument:
         )
 
     def format_context_records(self, context_records) -> list[RetrievedDocument]:
-        entities = context_records.get("entities", [])
-        relationships = context_records.get("relationships", [])
-        reports = context_records.get("reports", [])
-        sources = context_records.get("sources", [])
+        entities = context_records.get("entities", pd.DataFrame())
+        relationships = context_records.get("relationships", pd.DataFrame())
+        reports = context_records.get("reports", pd.DataFrame())
+        sources = context_records.get("sources", pd.DataFrame())
         docs = []
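
Reviewer note: the API change driving PATCH 1/3 and PATCH 3/3 is that in graphrag 1.0 `build_context` no longer returns a `(context_text, context_records)` tuple but a `ContextBuilderResult` (see the builders.py link in PATCH 3/3). A minimal sketch of the difference, assuming a `context_builder` set up as in `_build_graph_search`; the query text here is illustrative:

```python
import pandas as pd

# graphrag <= 0.3.x:
# context_text, context_records = context_builder.build_context(...)

# graphrag 1.0 returns a ContextBuilderResult object instead:
result = context_builder.build_context(
    query="an example question",  # illustrative
    conversation_history=None,
)

# context_records now maps record types to pandas DataFrames, which is why
# format_context_records defaults to pd.DataFrame() rather than [].
entities = result.context_records.get("entities", pd.DataFrame())
print(f"entities in retrieval context: {len(entities)}")
```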