From d0a4ef252ee480579777cdfda164651e0118bdf6 Mon Sep 17 00:00:00 2001
From: david
Date: Tue, 10 Dec 2024 09:00:22 +0800
Subject: [PATCH 01/11] fix: rare embedding issue.

---
 lightrag/storage.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/lightrag/storage.py b/lightrag/storage.py
index 007d6534..4c043893 100644
--- a/lightrag/storage.py
+++ b/lightrag/storage.py
@@ -107,10 +107,16 @@ async def upsert(self, data: dict[str, dict]):
             embeddings = await f
             embeddings_list.append(embeddings)
         embeddings = np.concatenate(embeddings_list)
-        for i, d in enumerate(list_data):
-            d["__vector__"] = embeddings[i]
-        results = self._client.upsert(datas=list_data)
-        return results
+        if len(embeddings) == len(list_data):
+            for i, d in enumerate(list_data):
+                d["__vector__"] = embeddings[i]
+            results = self._client.upsert(datas=list_data)
+            return results
+        else:
+            # Sometimes the embedding is not returned correctly; just log it.
+            logger.error(
+                f"embedding is not 1-1 with data, {len(embeddings)} != {len(list_data)}"
+            )

     async def query(self, query: str, top_k=5):
         embedding = await self.embedding_func([query])

From f6eeedb050a9bdf7d2c598634c76fa152de8e69b Mon Sep 17 00:00:00 2001
From: david
Date: Tue, 10 Dec 2024 09:01:21 +0800
Subject: [PATCH 02/11] add concurrent embedding limit

---
 lightrag/utils.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 0220af06..bdb47592 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -17,6 +17,17 @@
 from lightrag.prompt import PROMPTS

+
+class UnlimitedSemaphore:
+    """A context manager that allows unlimited access."""
+
+    async def __aenter__(self):
+        pass
+
+    async def __aexit__(self, exc_type, exc, tb):
+        pass
+
+
 ENCODER = None

 logger = logging.getLogger("lightrag")
@@ -42,9 +53,17 @@ class EmbeddingFunc:
     embedding_dim: int
     max_token_size: int
     func: callable
+    concurrent_limit: int = 16
+
+    def __post_init__(self):
+        if self.concurrent_limit != 0:
+            self._semaphore = asyncio.Semaphore(self.concurrent_limit)
+        else:
+            self._semaphore = UnlimitedSemaphore()

     async def __call__(self, *args, **kwargs) -> np.ndarray:
-        return await self.func(*args, **kwargs)
+        async with self._semaphore:
+            return await self.func(*args, **kwargs)


 def locate_json_string_body_from_string(content: str) -> Union[str, None]:
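A minimal usage sketch of the new `concurrent_limit`: the embedding function below is a hypothetical stand-in for a real embedding API call, and only the `EmbeddingFunc` fields shown in the patch are assumed.

```python
import asyncio

import numpy as np

from lightrag.utils import EmbeddingFunc


async def fake_embed(texts: list[str]) -> np.ndarray:
    # Hypothetical stand-in for a real embedding API call.
    await asyncio.sleep(0.1)
    return np.random.rand(len(texts), 1024)


# At most 4 calls to fake_embed run concurrently; concurrent_limit=0 disables the cap.
embedding_func = EmbeddingFunc(
    embedding_dim=1024, max_token_size=8192, func=fake_embed, concurrent_limit=4
)


async def main():
    # Each call acquires the internal semaphore before invoking func.
    batches = [[f"text {i}"] for i in range(16)]
    results = await asyncio.gather(*(embedding_func(b) for b in batches))
    print(len(results))  # 16


asyncio.run(main())
```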
From 21a3992e397e35449769f9d878e74af6f7a581ab Mon Sep 17 00:00:00 2001
From: david
Date: Tue, 10 Dec 2024 09:52:27 +0800
Subject: [PATCH 03/11] fix extra keyword_extraction.

---
 lightrag/llm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lightrag/llm.py b/lightrag/llm.py
index d725ea85..dbf32844 100644
--- a/lightrag/llm.py
+++ b/lightrag/llm.py
@@ -63,6 +63,7 @@ async def openai_complete_if_cache(
         AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url)
     )
     kwargs.pop("hashing_kv", None)
+    kwargs.pop("keyword_extraction", None)
     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})

From 58c0f943464852d7f88c6a766e981c7563d13b7f Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Tue, 10 Dec 2024 14:13:11 +0800
Subject: [PATCH 04/11] fix(lightrag): fix chunk handling when only entities
 and no relationships are extracted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- when a chunk yields only entities and no relationships, keep processing
  instead of returning early
- a knowledge graph that has only entities and no relationships returns an
  empty result for high-level relationship queries, so fall back to a local
  query when no relationships are found

---
 lightrag/operate.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 468f4b2f..ec55694d 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -412,15 +412,17 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
     ):
         all_relationships_data.append(await result)

-    if not len(all_entities_data):
-        logger.warning("Didn't extract any entities, maybe your LLM is not working")
-        return None
-    if not len(all_relationships_data):
+    if not len(all_entities_data) and not len(all_relationships_data):
         logger.warning(
-            "Didn't extract any relationships, maybe your LLM is not working"
+            "Didn't extract any entities or relationships, maybe your LLM is not working"
         )
         return None

+    if not len(all_entities_data):
+        logger.warning("Didn't extract any entities")
+    if not len(all_relationships_data):
+        logger.warning("Didn't extract any relationships")
+
     if entity_vdb is not None:
         data_for_vdb = {
             compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
@@ -630,6 +632,13 @@ async def _build_query_context(
             text_chunks_db,
             query_param,
         )
+        if (
+            hl_entities_context == ""
+            and hl_relations_context == ""
+            and hl_text_units_context == ""
+        ):
+            logger.warning("No high level context found. Switching to local mode.")
+            query_param.mode = "local"
     if query_param.mode == "hybrid":
         entities_context, relations_context, text_units_context = combine_contexts(
             [hl_entities_context, ll_entities_context],
@@ -865,7 +874,7 @@ async def _get_edge_data(
     results = await relationships_vdb.query(keywords, top_k=query_param.top_k)

     if not len(results):
-        return None
+        return "", "", ""

     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )

From 316c4df949eda0590e0e33eae07e6cbd386edfc3 Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Tue, 10 Dec 2024 14:15:43 +0800
Subject: [PATCH 05/11] update the log message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lightrag/operate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index ec55694d..bc5a9b13 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -414,7 +414,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):

     if not len(all_entities_data) and not len(all_relationships_data):
         logger.warning(
-            "Didn't extract any entities or relationships, maybe your LLM is not working"
+            "Didn't extract any entities and relationships, maybe your LLM is not working"
         )
         return None

From 288d4b8355eb3fd1fdfaf165f1380501b58aaf67 Mon Sep 17 00:00:00 2001
From: david
Date: Tue, 10 Dec 2024 17:16:21 +0800
Subject: [PATCH 06/11] fix lazy import

---
 lightrag/lightrag.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 833926e5..3a4276cb 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -48,18 +48,25 @@

 def lazy_external_import(module_name: str, class_name: str):
-    """Lazily import an external module and return a class from it."""
+    """Lazily import a class from an external module based on the package of the caller."""

-    def import_class():
+    # Get the caller's module and package
+    import inspect
+
+    caller_frame = inspect.currentframe().f_back
+    module = inspect.getmodule(caller_frame)
+    package = module.__package__ if module else None
+
+    def import_class(*args, **kwargs):
         import importlib

         # Import the module using importlib
-        module = importlib.import_module(module_name)
+        module = importlib.import_module(module_name, package=package)

-        # Get the class from the module
-        return getattr(module, class_name)
+        # Get the class from the module and instantiate it
+        cls = getattr(module, class_name)
+        return cls(*args, **kwargs)

-    # Return the import_class function itself, not its result
     return import_class
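To make the intent of the fix concrete, here is a hedged sketch of the returned closure in use. `collections.Counter` stands in for an optional storage backend; the point is that the closure now imports lazily and instantiates the class directly:

```python
from lightrag.lightrag import lazy_external_import

# Nothing is imported until the first call; the caller's package, captured
# at definition time, lets relative module names resolve correctly even
# though importlib only runs inside the closure.
LazyCounter = lazy_external_import("collections", "Counter")

# After the fix, calling the closure imports the module, looks up the class,
# and instantiates it with the forwarded arguments in one step.
counter = LazyCounter("hello")
print(counter)  # Counter({'l': 2, 'h': 1, 'e': 1, 'o': 1})
```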
From a09e1ba190c155d64ebcbfd98498621068e111de Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Wed, 11 Dec 2024 12:45:10 +0800
Subject: [PATCH 07/11] feat(llm, prompt): add debug logging and extend the
 entity types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- add debug logging in llm.py to record the query input sent to the LLM
- add the "category" entity type in prompt.py to broaden entity extraction

---
 lightrag/llm.py    | 7 ++++++-
 lightrag/prompt.py | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/lightrag/llm.py b/lightrag/llm.py
index d725ea85..e0277248 100644
--- a/lightrag/llm.py
+++ b/lightrag/llm.py
@@ -29,7 +29,7 @@
 from .utils import (
     wrap_embedding_func_with_attrs,
     locate_json_string_body_from_string,
-    safe_unicode_decode,
+    safe_unicode_decode, logger,
 )

 import sys
@@ -69,6 +69,11 @@
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})

+    # Log the query input for debugging
+    logger.debug("===== Query Input to LLM =====")
+    logger.debug(f"Query: {prompt}")
+    logger.debug(f"System prompt: {system_prompt}")
+    logger.debug("Full context:")
     if "response_format" in kwargs:
         response = await openai_async_client.beta.chat.completions.parse(
             model=model, messages=messages, **kwargs

diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index b62f02b5..d5674f15 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -8,7 +8,7 @@
 PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
 PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]

-PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
+PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event", "category"]

 PROMPTS["entity_extraction"] = """-Goal-
 Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
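The new `logger.debug` calls only appear once the package logger is verbose enough; one minimal way to surface them (the format string is illustrative):

```python
import logging

# The library logs under the "lightrag" name (see lightrag/utils.py).
logging.basicConfig(format="%(levelname)s: %(message)s")
logging.getLogger("lightrag").setLevel(logging.DEBUG)
```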
From 0a41cc8a9aa33bb3d91f8ea6290e3423721eda9f Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Wed, 11 Dec 2024 12:57:58 +0800
Subject: [PATCH 08/11] refactor(prompt): refine the prompt template to improve
 similarity-evaluation accuracy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- spell out the criteria for the similarity score, including what to base
  the score on in different cases
- simplify the scoring flow by asking for a bare number in return
- these changes should improve the accuracy and consistency of the evaluation

---
 lightrag/prompt.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index d5674f15..9d9e6034 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -268,14 +268,19 @@
 Question 1: {original_prompt}
 Question 2: {cached_prompt}

-Please evaluate:
+Please evaluate the following two points and provide a similarity score between 0 and 1 directly:
 1. Whether these two questions are semantically similar
 2. Whether the answer to Question 2 can be used to answer Question 1
-
-Please provide a similarity score between 0 and 1, where:
-0: Completely unrelated or answer cannot be reused
+Similarity score criteria:
+0: Completely unrelated or answer cannot be reused, including but not limited to:
+   - The questions have different topics
+   - The locations mentioned in the questions are different
+   - The times mentioned in the questions are different
+   - The specific individuals mentioned in the questions are different
+   - The specific events mentioned in the questions are different
+   - The background information in the questions is different
+   - The key conditions in the questions are different
 1: Identical and answer can be directly reused
 0.5: Partially related and answer needs modification to be used
-
 Return only a number between 0-1, without any additional content.
 """

From b89041b5b38c5d4b2185fe9bc7b84d557d6ac981 Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Wed, 11 Dec 2024 13:53:05 +0800
Subject: [PATCH 09/11] feat(operate): make entity types configurable and
 refine prompt generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- add an entity_types option to addon_params in the global config for
  customizing entity types
- use the configured entity types instead of the defaults when building the
  entity- and relationship-extraction prompts
- makes prompt generation more configurable and flexible

---
 lightrag/operate.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index bc5a9b13..8b8ad85b 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -260,6 +260,9 @@ async def extract_entities(
     language = global_config["addon_params"].get(
         "language", PROMPTS["DEFAULT_LANGUAGE"]
     )
+    entity_types = global_config["addon_params"].get(
+        "entity_types", PROMPTS["DEFAULT_ENTITY_TYPES"]
+    )
     example_number = global_config["addon_params"].get("example_number", None)
     if example_number and example_number < len(PROMPTS["entity_extraction_examples"]):
         examples = "\n".join(
@@ -272,7 +275,7 @@
         tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
         record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
         completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
-        entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
+        entity_types=",".join(entity_types),
         language=language,
     )
     # add example's format
@@ -283,7 +286,7 @@
         tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
         record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
         completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
-        entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
+        entity_types=",".join(entity_types),
         examples=examples,
         language=language,
     )

From 9a2afc9484d9a7db93bd2b946a6a963672c327d1 Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Wed, 11 Dec 2024 14:06:55 +0800
Subject: [PATCH 10/11] style(lightrag): adjust code formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lightrag/llm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lightrag/llm.py b/lightrag/llm.py
index e0277248..f3fed23f 100644
--- a/lightrag/llm.py
+++ b/lightrag/llm.py
@@ -29,7 +29,8 @@
 from .utils import (
     wrap_embedding_func_with_attrs,
     locate_json_string_body_from_string,
-    safe_unicode_decode, logger,
+    safe_unicode_decode,
+    logger,
 )

 import sys
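A hedged sketch of the configuration introduced in PATCH 09: `entity_types` in `addon_params` overrides `PROMPTS["DEFAULT_ENTITY_TYPES"]` during extraction. The working directory and the type list shown are illustrative, and other constructor arguments are omitted.

```python
from lightrag import LightRAG

# entity_types replaces the defaults when building extraction prompts;
# the values here are illustrative.
rag = LightRAG(
    working_dir="./rag_storage",
    addon_params={
        "example_number": 1,
        "language": "English",
        "entity_types": ["organization", "person", "geo", "event", "category"],
    },
)
```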
From b63c6155ee0d2e5d8504c1c723b86cec342ae7a8 Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Wed, 11 Dec 2024 14:10:27 +0800
Subject: [PATCH 11/11] style(lightrag): update the README with a configuration
 example for the custom entity types parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a1454792..a24c9b72 100644
--- a/README.md
+++ b/README.md
@@ -594,7 +594,7 @@ if __name__ == "__main__":
 | **llm\_model\_kwargs** | `dict` | Additional parameters for LLM generation | |
 | **vector\_db\_storage\_cls\_kwargs** | `dict` | Additional parameters for vector database (currently not used) | |
 | **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
-| **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese"}`: sets example limit and output language | `example_number: all examples, language: English` |
+| **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"]}`: sets example limit and output language | `example_number: all examples, language: English` |
 | **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` |
 | **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:<br>- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.<br>- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.<br>- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |
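For reference, a minimal sketch of the cache configuration documented in the row above. The threshold shown is simply the documented default, and the other constructor arguments are omitted or illustrative.

```python
from lightrag import LightRAG

# Question-answer caching as documented in the README table above.
rag = LightRAG(
    working_dir="./rag_storage",
    embedding_cache_config={
        "enabled": True,               # check the cache before calling the LLM
        "similarity_threshold": 0.95,  # reuse cached answers above this similarity
        "use_llm_check": False,        # set True for an LLM-based secondary check
    },
)
```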