chore(deps): update graphrag to version 1.0 #588

Open · wants to merge 3 commits into `main`
README.md (2 changes: 1 addition & 1 deletion)
@@ -206,7 +206,7 @@ documents and developers who want to build their own RAG pipeline.
- **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command:

```shell
pip install "graphrag<=0.3.6" future
pip install "graphrag<=1.0.1" future
```

- **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file.
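
For illustration only (not part of this diff), a minimal Python-side guard for the key might look like this; `GRAPHRAG_API_KEY` is the variable the README documents, while the guard itself is a hypothetical addition:

```python
import os

# Fail fast if the key is missing before any GraphRAG retriever code runs.
# Loading values from a .env file is left to the application's config layer.
if not os.environ.get("GRAPHRAG_API_KEY"):
    raise RuntimeError("GRAPHRAG_API_KEY is not set; export it or add it to .env")
```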
libs/ktem/ktem/index/file/graph/pipelines.py (42 changes: 14 additions & 28 deletions)
@@ -2,7 +2,6 @@
import shutil
import subprocess
from pathlib import Path
-from shutil import rmtree
from typing import Generator
from uuid import uuid4

@@ -27,7 +26,6 @@
read_indexer_reports,
read_indexer_text_units,
)
-from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import (
@@ -115,25 +113,16 @@ def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
input_path = str(input_path.absolute())

# Construct the command
-        command = [
-            "python",
-            "-m",
-            "graphrag.index",
-            "--root",
-            input_path,
-            "--reporter",
-            "rich",
-            "--init",
-        ]
+        init_command = ["graphrag", "init", "--root", input_path]
+        index_command = ["graphrag", "index", "--root", input_path]

# Run the command
yield Document(
channel="debug",
text="[GraphRAG] Creating index... This can take a long time.",
)
-        result = subprocess.run(command, capture_output=True, text=True)
+        result = subprocess.run(init_command, capture_output=True, text=True)
        print(result.stdout)
-        command = command[:-1]

# copy customized GraphRAG config file if it exists
if config("USE_CUSTOMIZED_GRAPHRAG_SETTING", default="value").lower() == "true":
@@ -146,7 +135,9 @@ def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
print("failed to copy customized GraphRAG config file. ")

# Run the command and stream stdout
-        with subprocess.Popen(command, stdout=subprocess.PIPE, text=True) as process:
+        with subprocess.Popen(
+            index_command, stdout=subprocess.PIPE, text=True
+        ) as process:
if process.stdout:
for line in process.stdout:
yield Document(channel="debug", text=line)
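
Reviewer note: the hunk above replaces the old mutated command list (run once with `--init`, then re-run without it) with GraphRAG 1.0's explicit two-step CLI. A standalone sketch of that flow, assuming the pinned graphrag (`<=1.0.1`) puts the `graphrag` entry point on PATH and using a hypothetical project root:

```python
import subprocess

root = "./graphrag_project"  # hypothetical root containing an input/ folder

# Step 1: one-time scaffolding of settings.yaml, .env and prompts/ under the
# root. As in the diff, the result is printed rather than checked, since init
# errors out when the root is already initialized.
init = subprocess.run(
    ["graphrag", "init", "--root", root], capture_output=True, text=True
)
print(init.stdout)

# Step 2: run the indexing pipeline against the same root, streaming stdout.
with subprocess.Popen(
    ["graphrag", "index", "--root", root], stdout=subprocess.PIPE, text=True
) as proc:
    assert proc.stdout is not None
    for line in proc.stdout:
        print(line, end="")
```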
@@ -227,14 +218,9 @@ def _build_graph_search(self):
# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
        description_embedding_store = LanceDBVectorStore(
-            collection_name="entity_description_embeddings",
+            collection_name="default-entity-description",
        )
        description_embedding_store.connect(db_uri=LANCEDB_URI)
-        if Path(LANCEDB_URI).is_dir():
-            rmtree(LANCEDB_URI)
-        _ = store_entity_semantic_embeddings(
-            entities=entities, vectorstore=description_embedding_store
-        )
print(f"Entity count: {len(entity_df)}")

# Read relationships
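
Context for this hunk: with GraphRAG 1.0 the indexer itself persists entity-description embeddings to LanceDB (see the `vector_store` block added to settings.yaml.example later in this PR), so the manual `rmtree` reset and `store_entity_semantic_embeddings` call can go. A minimal read-only connection sketch with a hypothetical path; the `default-entity-description` name appears to follow 1.x's `<container_name>-entity-description` convention with `container_name: default`:

```python
from graphrag.vector_stores import LanceDBVectorStore

# Hypothetical location: with the example settings, the indexer writes to
# "<root>/output/lancedb".
LANCEDB_URI = "./graphrag_project/output/lancedb"

store = LanceDBVectorStore(collection_name="default-entity-description")
store.connect(db_uri=LANCEDB_URI)
# No explicit store step: the embeddings were already persisted at index time.
```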
@@ -305,10 +291,10 @@ def _to_document(self, header: str, context_text: str) -> RetrievedDocument:
)

def format_context_records(self, context_records) -> list[RetrievedDocument]:
-        entities = context_records.get("entities", [])
-        relationships = context_records.get("relationships", [])
-        reports = context_records.get("reports", [])
-        sources = context_records.get("sources", [])
+        entities = context_records.get("entities", pd.DataFrame())
+        relationships = context_records.get("relationships", pd.DataFrame())
+        reports = context_records.get("reports", pd.DataFrame())
+        sources = context_records.get("sources", pd.DataFrame())

docs = []
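
The new `pd.DataFrame()` defaults matter because GraphRAG 1.0 returns context records as pandas DataFrames, and the formatting code presumably consumes them with DataFrame APIs; a list default would break that path. A tiny sketch of the failure mode this guards against (the empty dict is hypothetical):

```python
import pandas as pd

context_records: dict[str, pd.DataFrame] = {}  # e.g. a query matched nothing

entities = context_records.get("entities", pd.DataFrame())
for _, row in entities.iterrows():  # safe: iterating an empty DataFrame
    print(row)

# With the old default of [], DataFrame-specific calls such as .iterrows()
# would raise AttributeError.
```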

@@ -382,13 +368,13 @@ def run(
# (if you are using a model with 8k limit, a good setting could be 5000)
}

-        context_text, context_records = context_builder.build_context(
+        context_builder_result = context_builder.build_context(
query=text,
conversation_history=None,
**local_context_params,
)
-        documents = self.format_context_records(context_records)
-        plot = self.plot_graph(context_records)
+        documents = self.format_context_records(context_builder_result.context_records)
+        plot = self.plot_graph(context_builder_result.context_records)

return documents + [
RetrievedDocument(
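
The last two hunks track another 1.0 API change: `build_context` now returns a result object instead of a `(context_text, context_records)` tuple. A hedged sketch of the new shape; `context_records` is what the PR reads, while the `context_chunks` attribute name is taken from graphrag 1.x and worth verifying against the pinned release:

```python
import pandas as pd
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)


def build_local_context(context_builder: LocalSearchMixedContext, query: str):
    # GraphRAG 1.0: build_context() returns an object, not a tuple.
    result = context_builder.build_context(
        query=query,
        conversation_history=None,
    )
    records: dict[str, pd.DataFrame] = result.context_records
    context_text = result.context_chunks  # the assembled prompt context
    return context_text, records
```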
settings.yaml.example (135 changes: 57 additions & 78 deletions)
@@ -2,62 +2,53 @@
# The parameters in this file take effect only when USE_CUSTOMIZED_GRAPHRAG_SETTING is set to true in the .env file.
# For a comprehensive understanding of GraphRAG parameters, please refer to: https://microsoft.github.io/graphrag/config/json_yaml/.

-encoding_model: cl100k_base
-skip_workflows: []
+### This config file contains required core defaults that must be set, along with a handful of common optional settings.
+### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/
+
+### LLM settings ###
+## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.
+
+encoding_model: cl100k_base # this needs to be matched to your model!

llm:
-  api_key: ${GRAPHRAG_API_KEY}
+  api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file
type: openai_chat # or azure_openai_chat
-  api_base: http://127.0.0.1:11434/v1
model: qwen2
model_supports_json: true # recommended if this is available for your model.
# max_tokens: 4000
-  request_timeout: 1800.0
+  # audience: "https://cognitiveservices.azure.com/.default"
# api_base: https://<instance>.openai.azure.com
# api_version: 2024-02-15-preview
# organization: <organization_id>
# deployment_name: <azure_model_deployment_name>
# tokens_per_minute: 150_000 # set a leaky bucket throttle
# requests_per_minute: 10_000 # set a leaky bucket throttle
# max_retries: 10
# max_retry_wait: 10.0
# sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+  api_base: http://127.0.0.1:11434/v1
+  request_timeout: 1800.0
concurrent_requests: 5 # the number of parallel inflight requests that may be made
# temperature: 0 # temperature for sampling
# top_p: 1 # top-p sampling
# n: 1 # Number of completions to generate

parallelization:
stagger: 0.3
-  # num_threads: 50 # the number of threads to use for parallel processing
+  # num_threads: 50

async_mode: threaded # or asyncio

embeddings:
-  ## parallelization: override the global parallelization settings for embeddings
async_mode: threaded # or asyncio
# target: required # or all
# batch_size: 16 # the number of documents to send in a single request
# batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
+  vector_store:
+    type: lancedb
+    db_uri: 'output/lancedb'
+    container_name: default
+    overwrite: true
llm:
api_base: http://localhost:11434/v1
api_key: ${GRAPHRAG_API_KEY}
-    type: openai_embedding # or azure_openai_embedding
    model: nomic-embed-text
+    type: openai_embedding
# api_base: https://<instance>.openai.azure.com
# api_version: 2024-02-15-preview
# audience: "https://cognitiveservices.azure.com/.default"
# organization: <organization_id>
# deployment_name: <azure_model_deployment_name>
# tokens_per_minute: 150_000 # set a leaky bucket throttle
# requests_per_minute: 10_000 # set a leaky bucket throttle
# max_retries: 10
# max_retry_wait: 10.0
# sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
# concurrent_requests: 25 # the number of parallel inflight requests that may be made

-chunks:
-  size: 1200
-  overlap: 100
-  group_by_columns: [id] # by default, we don't allow chunks to cross documents
+### Input settings ###

input:
type: file # or blob
@@ -66,54 +57,53 @@ input:
file_encoding: utf-8
file_pattern: ".*\\.txt$"

+chunks:
+  size: 1200
+  overlap: 100
+  group_by_columns: [id]
+
+### Storage settings ###
+## If blob storage is specified in the following four sections,
+## connection_string and container_name must be provided

cache:
type: file # or blob
base_dir: "cache"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

+reporting:
+  type: file # or console, blob
+  base_dir: "logs"

storage:
type: file # or blob
base_dir: "output"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

-reporting:
-  type: file # or console, blob
-  base_dir: "output"
-  # connection_string: <azure_blob_storage_connection_string>
-  # container_name: <azure_blob_storage_container_name>
+## only turn this on if running `graphrag index` with custom settings
+## we normally use `graphrag update` with the defaults
+update_index_storage:
+  # type: file # or blob
+  # base_dir: "update_output"
+
+### Workflow settings ###
+
+skip_workflows: []

entity_extraction:
-  ## strategy: fully override the entity extraction strategy.
-  ##   type: one of graph_intelligence, graph_intelligence_json and nltk
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
prompt: "prompts/entity_extraction.txt"
entity_types: [organization,person,geo,event]
max_gleanings: 1

summarize_descriptions:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
prompt: "prompts/summarize_descriptions.txt"
max_length: 500

claim_extraction:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
-  # enabled: true
+  enabled: false
prompt: "prompts/claim_extraction.txt"
description: "Any claims or facts that could be relevant to information discovery."
max_gleanings: 1

community_reports:
-  ## llm: override the global llm settings for this task
-  ## parallelization: override the global parallelization settings for this task
-  ## async_mode: override the global async_mode settings for this task
prompt: "prompts/community_report.txt"
max_length: 2000
max_input_length: 8000
@@ -123,37 +113,26 @@ cluster_graph:

embed_graph:
enabled: false # if true, will generate node2vec embeddings for nodes
-  # num_walks: 10
-  # walk_length: 40
-  # window_size: 2
-  # iterations: 3
-  # random_seed: 597832

umap:
enabled: false # if true, will generate UMAP embeddings for nodes

snapshots:
graphml: false
-  raw_entities: false
-  top_level_nodes: false
+  embeddings: false
+  transient: false

+### Query settings ###
+## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
+## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query

local_search:
-  # text_unit_prop: 0.5
-  # community_prop: 0.1
-  # conversation_history_max_turns: 5
-  # top_k_mapped_entities: 10
-  # top_k_relationships: 10
-  # llm_temperature: 0 # temperature for sampling
-  # llm_top_p: 1 # top-p sampling
-  # llm_n: 1 # Number of completions to generate
-  # max_tokens: 12000
+  prompt: "prompts/local_search_system_prompt.txt"

global_search:
-  # llm_temperature: 0 # temperature for sampling
-  # llm_top_p: 1 # top-p sampling
-  # llm_n: 1 # Number of completions to generate
-  # max_tokens: 12000
-  # data_max_tokens: 12000
-  # map_max_tokens: 1000
-  # reduce_max_tokens: 2000
-  # concurrency: 32
+  map_prompt: "prompts/global_search_map_system_prompt.txt"
+  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
+  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"

+drift_search:
+  prompt: "prompts/drift_search_system_prompt.txt"