From 9de3eeb194439d1371a36899cfefca6033d4d10b Mon Sep 17 00:00:00 2001
From: Hideyuki Kagami
Date: Sat, 28 Dec 2024 01:35:42 +0900
Subject: [PATCH 1/3] chore(deps): update graphrag to version 1.0

---
 README.md                                    |  2 +-
 libs/ktem/ktem/index/file/graph/pipelines.py | 34 ++++++--------------
 2 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 671c99f0..8df48d72 100644
--- a/README.md
+++ b/README.md
@@ -206,7 +206,7 @@ documents and developers who want to build their own RAG pipeline.
 - **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command:
 
   ```shell
-  pip install "graphrag<=0.3.6" future
+  pip install "graphrag<=1.0.1" future
   ```
 
 - **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file.

diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py
index 31b491b4..dd1ca1c7 100644
--- a/libs/ktem/ktem/index/file/graph/pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/pipelines.py
@@ -2,7 +2,6 @@
 import shutil
 import subprocess
 from pathlib import Path
-from shutil import rmtree
 from typing import Generator
 from uuid import uuid4
 
@@ -27,7 +26,6 @@
     read_indexer_reports,
     read_indexer_text_units,
 )
-from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
 from graphrag.query.llm.oai.embedding import OpenAIEmbedding
 from graphrag.query.llm.oai.typing import OpenaiApiType
 from graphrag.query.structured_search.local_search.mixed_context import (
@@ -115,25 +113,16 @@ def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
         input_path = str(input_path.absolute())
 
         # Construct the command
-        command = [
-            "python",
-            "-m",
-            "graphrag.index",
-            "--root",
-            input_path,
-            "--reporter",
-            "rich",
-            "--init",
-        ]
+        init_command = ["graphrag", "init", "--root", input_path]
+        index_command = ["graphrag", "index", "--root", input_path]
 
         # Run the command
         yield Document(
             channel="debug",
             text="[GraphRAG] Creating index... This can take a long time.",
         )
-        result = subprocess.run(command, capture_output=True, text=True)
+        result = subprocess.run(init_command, capture_output=True, text=True)
         print(result.stdout)
-        command = command[:-1]
 
         # copy customized GraphRAG config file if it exists
         if config("USE_CUSTOMIZED_GRAPHRAG_SETTING", default="value").lower() == "true":
@@ -146,7 +135,9 @@ def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
             print("failed to copy customized GraphRAG config file. ")
 
         # Run the command and stream stdout
-        with subprocess.Popen(command, stdout=subprocess.PIPE, text=True) as process:
+        with subprocess.Popen(
+            index_command, stdout=subprocess.PIPE, text=True
+        ) as process:
             if process.stdout:
                 for line in process.stdout:
                     yield Document(channel="debug", text=line)
@@ -227,14 +218,9 @@ def _build_graph_search(self):
         # load description embeddings to an in-memory lancedb vectorstore
         # to connect to a remote db, specify url and port values.
         description_embedding_store = LanceDBVectorStore(
-            collection_name="entity_description_embeddings",
+            collection_name="default-entity-description",
         )
         description_embedding_store.connect(db_uri=LANCEDB_URI)
-        if Path(LANCEDB_URI).is_dir():
-            rmtree(LANCEDB_URI)
-        _ = store_entity_semantic_embeddings(
-            entities=entities, vectorstore=description_embedding_store
-        )
         print(f"Entity count: {len(entity_df)}")
 
         # Read relationships
@@ -382,13 +368,13 @@ def run(
             # (if you are using a model with 8k limit, a good setting could be 5000)
         }
 
-        context_text, context_records = context_builder.build_context(
+        context_builder_result = context_builder.build_context(
             query=text,
             conversation_history=None,
             **local_context_params,
         )
-        documents = self.format_context_records(context_records)
-        plot = self.plot_graph(context_records)
+        documents = self.format_context_records(context_builder_result.context_records)
+        plot = self.plot_graph(context_builder_result.context_records)
 
         return documents + [
             RetrievedDocument(

From e5b58a8c64694d90991ea676a16deb79442e96ed Mon Sep 17 00:00:00 2001
From: Hideyuki Kagami
Date: Mon, 30 Dec 2024 15:51:44 +0900
Subject: [PATCH 2/3] fix: update settings.yaml

'embeddings.vector_store' setting is required from v1.0
https://github.com/microsoft/graphrag/blob/v1.0.1/graphrag/config/init_content.py
---
 settings.yaml.example | 135 ++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 78 deletions(-)

diff --git a/settings.yaml.example b/settings.yaml.example
index 7b0ca776..3598bb39 100644
--- a/settings.yaml.example
+++ b/settings.yaml.example
@@ -2,62 +2,53 @@
 # The parameters in this file will only take effect when the USE_CUSTOMIZED_GRAPHRAG_SETTING is true in .env file.
 # For a comprehensive understanding of GraphRAG parameters, please refer to: https://microsoft.github.io/graphrag/config/json_yaml/.
 
-encoding_model: cl100k_base
-skip_workflows: []
+### This config file contains required core defaults that must be set, along with a handful of common optional settings.
+### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/
+
+### LLM settings ###
+## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.
+
+encoding_model: cl100k_base # this needs to be matched to your model!
+
 llm:
-  api_key: ${GRAPHRAG_API_KEY}
+  api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file
   type: openai_chat # or azure_openai_chat
-  api_base: http://127.0.0.1:11434/v1
   model: qwen2
   model_supports_json: true # recommended if this is available for your model.
-  # max_tokens: 4000
-  request_timeout: 1800.0
+  # audience: "https://cognitiveservices.azure.com/.default"
   # api_base: https://<instance>.openai.azure.com
   # api_version: 2024-02-15-preview
   # organization: <organization_id>
   # deployment_name: <azure_model_deployment_name>
-  # tokens_per_minute: 150_000 # set a leaky bucket throttle
-  # requests_per_minute: 10_000 # set a leaky bucket throttle
-  # max_retries: 10
-  # max_retry_wait: 10.0
-  # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+  api_base: http://127.0.0.1:11434/v1
+  request_timeout: 1800.0
   concurrent_requests: 5 # the number of parallel inflight requests that may be made
-  # temperature: 0 # temperature for sampling
-  # top_p: 1 # top-p sampling
-  # n: 1 # Number of completions to generate
 
 parallelization:
   stagger: 0.3
-  # num_threads: 50 # the number of threads to use for parallel processing
+  # num_threads: 50
 
 async_mode: threaded # or asyncio
 
 embeddings:
-  ## parallelization: override the global parallelization settings for embeddings
   async_mode: threaded # or asyncio
-  # target: required # or all
-  # batch_size: 16 # the number of documents to send in a single request
-  # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
+  vector_store:
+    type: lancedb
+    db_uri: 'output/lancedb'
+    container_name: default
+    overwrite: true
   llm:
     api_base: http://localhost:11434/v1
     api_key: ${GRAPHRAG_API_KEY}
+    type: openai_embedding # or azure_openai_embedding
     model: nomic-embed-text
-    type: openai_embedding
     # api_base: https://<instance>.openai.azure.com
     # api_version: 2024-02-15-preview
+    # audience: "https://cognitiveservices.azure.com/.default"
     # organization: <organization_id>
     # deployment_name: <azure_model_deployment_name>
-    # tokens_per_minute: 150_000 # set a leaky bucket throttle
-    # requests_per_minute: 10_000 # set a leaky bucket throttle
-    # max_retries: 10
-    # max_retry_wait: 10.0
-    # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
-    # concurrent_requests: 25 # the number of parallel inflight requests that may be made
 
-chunks:
-  size: 1200
-  overlap: 100
-  group_by_columns: [id] # by default, we don't allow chunks to cross documents
+### Input settings ###
 
 input:
   type: file # or blob
@@ -66,54 +57,53 @@ input:
   file_encoding: utf-8
   file_pattern: ".*\\.txt$"
 
+chunks:
+  size: 1200
+  overlap: 100
+  group_by_columns: [id]
+
+### Storage settings ###
+## If blob storage is specified in the following four sections,
+## connection_string and container_name must be provided
 
 cache:
   type: file # or blob
   base_dir: "cache"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
+
+reporting:
+  type: file # or console, blob
+  base_dir: "logs"
 
 storage:
   type: file # or blob
   base_dir: "output"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
 
-reporting:
-  type: file # or console, blob
-  base_dir: "output"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
+## only turn this on if running `graphrag index` with custom settings
+## we normally use `graphrag update` with the defaults
+update_index_storage:
+  # type: file # or blob
+  # base_dir: "update_output"
+
+### Workflow settings ###
+
+skip_workflows: []
 
 entity_extraction:
-  ## strategy: fully override the entity extraction strategy.
-  ## type: one of graph_intelligence, graph_intelligence_json and nltk
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/entity_extraction.txt"
   entity_types: [organization,person,geo,event]
   max_gleanings: 1
 
 summarize_descriptions:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/summarize_descriptions.txt"
   max_length: 500
 
 claim_extraction:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
-  # enabled: true
+  enabled: false
   prompt: "prompts/claim_extraction.txt"
   description: "Any claims or facts that could be relevant to information discovery."
   max_gleanings: 1
 
 community_reports:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/community_report.txt"
   max_length: 2000
   max_input_length: 8000
@@ -123,37 +113,26 @@ cluster_graph:
 
 embed_graph:
   enabled: false # if true, will generate node2vec embeddings for nodes
-  # num_walks: 10
-  # walk_length: 40
-  # window_size: 2
-  # iterations: 3
-  # random_seed: 597832
 
 umap:
   enabled: false # if true, will generate UMAP embeddings for nodes
 
 snapshots:
   graphml: false
-  raw_entities: false
-  top_level_nodes: false
+  embeddings: false
+  transient: false
+
+### Query settings ###
+## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
+## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
 
 local_search:
-  # text_unit_prop: 0.5
-  # community_prop: 0.1
-  # conversation_history_max_turns: 5
-  # top_k_mapped_entities: 10
-  # top_k_relationships: 10
-  # llm_temperature: 0 # temperature for sampling
-  # llm_top_p: 1 # top-p sampling
-  # llm_n: 1 # Number of completions to generate
-  # max_tokens: 12000
+  prompt: "prompts/local_search_system_prompt.txt"
 
 global_search:
-  # llm_temperature: 0 # temperature for sampling
-  # llm_top_p: 1 # top-p sampling
-  # llm_n: 1 # Number of completions to generate
-  # max_tokens: 12000
-  # data_max_tokens: 12000
-  # map_max_tokens: 1000
-  # reduce_max_tokens: 2000
-  # concurrency: 32
+  map_prompt: "prompts/global_search_map_system_prompt.txt"
+  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
+  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"
+
+drift_search:
+  prompt: "prompts/drift_search_system_prompt.txt"

From 54f54ced02fdafaede4675ee2b7316508e9a3fa0 Mon Sep 17 00:00:00 2001
From: Hideyuki Kagami
Date: Mon, 30 Dec 2024 15:53:43 +0900
Subject: [PATCH 3/3] fix: update return type of context_records

https://github.com/microsoft/graphrag/blob/v1.0.1/graphrag/query/context_builder/builders.py#L21
---
 libs/ktem/ktem/index/file/graph/pipelines.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py
index dd1ca1c7..d1b20b49 100644
--- a/libs/ktem/ktem/index/file/graph/pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/pipelines.py
@@ -291,10 +291,10 @@ def _to_document(self, header: str, context_text: str) -> RetrievedDocument:
         )
 
     def format_context_records(self, context_records) -> list[RetrievedDocument]:
-        entities = context_records.get("entities", [])
-        relationships = context_records.get("relationships", [])
-        reports = context_records.get("reports", [])
-        sources = context_records.get("sources", [])
+        entities = context_records.get("entities", pd.DataFrame())
+        relationships = context_records.get("relationships", pd.DataFrame())
+        reports = context_records.get("reports", pd.DataFrame())
+        sources = context_records.get("sources", pd.DataFrame())
         docs = []
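
Reviewer note: the API change driving PATCH 1/3 and PATCH 3/3 is that in graphrag 1.0 `build_context` no longer returns a `(context_text, context_records)` tuple but a `ContextBuilderResult` (see the builders.py link in PATCH 3/3). A minimal sketch of the difference, assuming a `context_builder` set up as in `_build_graph_search`; the query text here is illustrative:

```python
import pandas as pd

# graphrag <= 0.3.x:
# context_text, context_records = context_builder.build_context(...)

# graphrag 1.0 returns a ContextBuilderResult object instead:
result = context_builder.build_context(
    query="an example question",  # illustrative
    conversation_history=None,
)

# context_records now maps record types to pandas DataFrames, which is why
# format_context_records defaults to pd.DataFrame() rather than [].
entities = result.context_records.get("entities", pd.DataFrame())
print(f"entities in retrieval context: {len(entities)}")
```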