From 4c950cf4ce2a99890eff3777ba7c11f5eb76ebbf Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Sun, 29 Dec 2024 15:25:57 +0800
Subject: [PATCH 1/2] feat: Enhance temporal support for knowledge graph
 relationships - add timestamp support for relationship and vector data,
 recording when knowledge was acquired - refine the hybrid query strategy
 to weigh both semantic relevance and temporal order - extend the prompt
 templates to guide the LLM to consider time factors when handling
 conflicting information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lightrag/operate.py | 53 +++++++++++++++++++++++++++++++++++++++------
 lightrag/prompt.py  | 21 ++++++++++++++++--
 lightrag/storage.py | 12 +++++++++-
 3 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index b63e3754..9232b61f 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -30,6 +30,7 @@
     QueryParam,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
+import time


 def chunking_by_token_size(
@@ -128,6 +129,9 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
+        metadata={
+            "created_at": time.time()
+        }
     )


@@ -445,6 +449,9 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
             + dp["src_id"]
             + dp["tgt_id"]
             + dp["description"],
+            "metadata": {
+                "created_at": dp.get("metadata", {}).get("created_at", time.time())
+            }
         }
         for dp in all_relationships_data
     }
@@ -733,9 +740,13 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)

     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
     ]
     for i, e in enumerate(use_relations):
+        created_at = e.get("created_at", "未知")
+        # 转换时间戳为可读格式
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
@@ -745,6 +756,7 @@ async def _get_node_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -882,6 +894,8 @@ async def _get_edge_data(
     if not len(results):
         return "", "", ""

+    # Get complete relationship information from KV storage
+    edge_ids = [r["id"] for r in results]
     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )
@@ -892,7 +906,13 @@ async def _get_edge_data(
         *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
     )
     edge_datas = [
-        {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v}
+        {
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
+            "created_at": k.get("__created_at__", None),  # Get time metadata from KV storage
+            **v
+        }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
     ]
@@ -916,9 +936,13 @@ async def _get_edge_data(
     )

     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
     ]
     for i, e in enumerate(edge_datas):
+        created_at = e.get("created_at", "未知")
+        # 转换时间戳为可读格式
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
@@ -928,6 +952,7 @@ async def _get_edge_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1259,9 +1284,15 @@ async def get_vector_context():
             chunks_ids = [r["id"] for r in results]
             chunks = await text_chunks_db.get_by_ids(chunks_ids)

-            valid_chunks = [
-                chunk for chunk in chunks if chunk is not None and "content" in chunk
-            ]
+            valid_chunks = []
+            for chunk, result in zip(chunks, results):
+                if chunk is not None and "content" in chunk:
+                    # 合并 chunk 内容和时间元数据
+                    chunk_with_time = {
+                        "content": chunk["content"],
+                        "created_at": result.get("created_at", None)
+                    }
+                    valid_chunks.append(chunk_with_time)

             if not valid_chunks:
                 return None
@@ -1275,7 +1306,15 @@ async def get_vector_context():
             if not maybe_trun_chunks:
                 return None

-            return "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
+            # 在内容中包含时间信息
+            formatted_chunks = []
+            for c in maybe_trun_chunks:
+                chunk_text = c["content"]
+                if c["created_at"]:
+                    chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
+                formatted_chunks.append(chunk_text)
+
+            return "\n--New Chunk--\n".join(formatted_chunks)
         except Exception as e:
             logger.error(f"Error in get_vector_context: {e}")
             return None
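For readers skimming the diff, the chunk formatting above reduces to the minimal sketch below. The sample timestamp and chunk text are illustrative stand-ins; in the patch, created_at comes from the vector store's __created_at__ field and the content from text_chunks_db.

    import time

    # Illustrative values; the patch pulls these from storage instead.
    created_at = time.time()
    chunk_text = "Some retrieved chunk content."

    # Same conversion the patch applies before joining chunks into context:
    if isinstance(created_at, (int, float)):
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
        chunk_text = f"[Created at: {stamp}]\n{chunk_text}"

    print(chunk_text)
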
diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index e0c9b4ab..e2ef59ad 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -164,6 +164,12 @@
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.

+When handling relationships with timestamps:
+1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting relationships, consider both the semantic content and the timestamp
+3. Don't automatically prefer the most recently created relationships - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Target response length and format---

 {response_type}
@@ -172,8 +178,7 @@

 {context_data}

-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""


 PROMPTS["keywords_extraction"] = """---Role---
@@ -250,6 +255,12 @@
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.

+When handling content with timestamps:
+1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content and the timestamp
+3. Don't automatically prefer the most recent content - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Target response length and format---

 {response_type}
@@ -293,6 +304,12 @@
 Generate a concise response that summarizes relevant points from the provided information.
 If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.

+When handling information with timestamps:
+1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content/relationship and the timestamp
+3. Don't automatically prefer the most recent information - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Data Sources---

 1. Knowledge Graph Data:
diff --git a/lightrag/storage.py b/lightrag/storage.py
index 0f65d09c..6be0d609 100644
--- a/lightrag/storage.py
+++ b/lightrag/storage.py
@@ -7,6 +7,7 @@
 import networkx as nx
 import numpy as np
 from nano_vectordb import NanoVectorDB
+import time

 from .utils import (
     logger,
@@ -87,9 +88,12 @@ async def upsert(self, data: dict[str, dict]):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
+
+        current_time = time.time()
         list_data = [
             {
                 "__id__": k,
+                "__created_at__": current_time,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
@@ -132,7 +136,13 @@ async def query(self, query: str, top_k=5):
             better_than_threshold=self.cosine_better_than_threshold,
         )
         results = [
-            {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
+            {
+                **dp,
+                "id": dp["__id__"],
+                "distance": dp["__metrics__"],
+                "created_at": dp.get("__created_at__")
+            }
+            for dp in results
        ]
        return results
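Taken together, the storage changes in this first patch form a simple round-trip: upsert() stamps every vector record with __created_at__, and query() surfaces it as created_at. A minimal self-contained sketch of that flow, using a plain dict as a stand-in for the NanoVectorDB instance:

    import time

    store = {}  # stand-in for the NanoVectorDB client

    def upsert(data):
        # Mirror of the patched upsert: one timestamp per batch.
        current_time = time.time()
        for k, v in data.items():
            store[k] = {"__id__": k, "__created_at__": current_time, **v}

    def query_all():
        # Mirror of the patched query: records written before the patch
        # lack __created_at__, so .get() yields None instead of raising.
        return [
            {**dp, "id": dp["__id__"], "created_at": dp.get("__created_at__")}
            for dp in store.values()
        ]

    upsert({"rel-1": {"src_id": "A", "tgt_id": "B"}})
    print(query_all()[0]["created_at"])  # a float epoch timestamp
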
From 7b91dc7fd899ae4b1878a063cffdea52c6158e31 Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Sun, 29 Dec 2024 15:37:34 +0800
Subject: [PATCH 2/2] feat: Enhance temporal support for knowledge graph
 relationships - add timestamp support for relationship and vector data,
 recording when knowledge was acquired - refine the hybrid query strategy
 to weigh both semantic relevance and temporal order - extend the prompt
 templates to guide the LLM to consider time factors when handling
 conflicting information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lightrag/operate.py | 56 ++++++++++++++++++++++++++++-----------------
 lightrag/storage.py | 10 ++++----
 2 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 9232b61f..f21e41ff 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -129,9 +129,7 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
-        metadata={
-            "created_at": time.time()
-        }
+        metadata={"created_at": time.time()},
     )
@@ -451,7 +449,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
             + dp["description"],
             "metadata": {
                 "created_at": dp.get("metadata", {}).get("created_at", time.time())
-            }
+            },
         }
         for dp in all_relationships_data
     }
@@ -740,11 +738,20 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)

     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(use_relations):
-        created_at = e.get("created_at", "未知")
-        # 转换时间戳为可读格式
+        created_at = e.get("created_at", "UNKNOWN")
+        # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
@@ -756,7 +763,7 @@ async def _get_node_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
-                created_at
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -894,8 +901,6 @@ async def _get_edge_data(
     if not len(results):
         return "", "", ""

-    # Get complete relationship information from KV storage
-    edge_ids = [r["id"] for r in results]
     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )
@@ -907,11 +912,11 @@ async def _get_edge_data(
     )
     edge_datas = [
         {
-            "src_id": k["src_id"],
-            "tgt_id": k["tgt_id"],
-            "rank": d,
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
             "created_at": k.get("__created_at__", None),  # Get time metadata from KV storage
-            **v
+            **v,
         }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
@@ -936,11 +941,20 @@ async def _get_edge_data(
     )

     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(edge_datas):
-        created_at = e.get("created_at", "未知")
-        # 转换时间戳为可读格式
+        created_at = e.get("created_at", "Unknown")
+        # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
@@ -952,7 +966,7 @@ async def _get_edge_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
-                created_at
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1287,10 +1301,10 @@ async def get_vector_context():
             valid_chunks = []
             for chunk, result in zip(chunks, results):
                 if chunk is not None and "content" in chunk:
-                    # 合并 chunk 内容和时间元数据
+                    # Merge chunk content and time metadata
                     chunk_with_time = {
                         "content": chunk["content"],
-                        "created_at": result.get("created_at", None)
+                        "created_at": result.get("created_at", None),
                     }
                     valid_chunks.append(chunk_with_time)

@@ -1306,7 +1320,7 @@ async def get_vector_context():
             if not maybe_trun_chunks:
                 return None

-            # 在内容中包含时间信息
+            # Include time information in content
             formatted_chunks = []
             for c in maybe_trun_chunks:
                 chunk_text = c["content"]
diff --git a/lightrag/storage.py b/lightrag/storage.py
index 6be0d609..4c862dbe 100644
--- a/lightrag/storage.py
+++ b/lightrag/storage.py
@@ -88,7 +88,7 @@ async def upsert(self, data: dict[str, dict]):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
-
+
         current_time = time.time()
         list_data = [
             {
@@ -137,11 +137,11 @@ async def query(self, query: str, top_k=5):
         )
         results = [
             {
-                **dp,
-                "id": dp["__id__"],
+                **dp,
+                "id": dp["__id__"],
                 "distance": dp["__metrics__"],
-                "created_at": dp.get("__created_at__")
-            }
+                "created_at": dp.get("__created_at__"),
+            }
             for dp in results
         ]
         return results
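End to end, the two patches mean the relations context handed to the LLM gains a created_at column. A minimal sketch of how one row is rendered, where the edge record and the CSV rendering are simplified stand-ins for LightRAG's edge data and list_of_list_to_csv helper:

    import time

    # Hypothetical edge record; in LightRAG it is assembled in _get_edge_data.
    edge_datas = [
        {"src_id": "A", "tgt_id": "B", "description": "A acquired B",
         "keywords": "acquisition", "weight": 1.0, "rank": 2,
         "created_at": time.time()},
    ]

    rows = [["id", "source", "target", "description", "keywords", "weight",
             "rank", "created_at"]]
    for i, e in enumerate(edge_datas):
        created_at = e.get("created_at", "Unknown")
        # Convert timestamp to readable format, as the patched code does
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
        rows.append([i, e["src_id"], e["tgt_id"], e["description"],
                     e["keywords"], e["weight"], e["rank"], created_at])

    # Simplified stand-in for list_of_list_to_csv:
    print("\n".join(",".join(str(cell) for cell in row) for row in rows))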