Skip to content

Commit

Permalink
Merge pull request #525 from magicyuan876/main
Browse files Browse the repository at this point in the history
feat: 增强知识图谱关系的时序性支持
  • Loading branch information
LarFii authored Dec 29, 2024
2 parents 4ddf87f + b84c4d1 commit 889d056
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 10 deletions.
67 changes: 60 additions & 7 deletions lightrag/operate.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
QueryParam,
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
import time


def chunking_by_token_size(
Expand Down Expand Up @@ -128,6 +129,7 @@ async def _handle_single_relationship_extraction(
description=edge_description,
keywords=edge_keywords,
source_id=edge_source_id,
metadata={"created_at": time.time()},
)


Expand Down Expand Up @@ -445,6 +447,9 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
+ dp["src_id"]
+ dp["tgt_id"]
+ dp["description"],
"metadata": {
"created_at": dp.get("metadata", {}).get("created_at", time.time())
},
}
for dp in all_relationships_data
}
Expand Down Expand Up @@ -733,9 +738,22 @@ async def _get_node_data(
entities_context = list_of_list_to_csv(entites_section_list)

relations_section_list = [
["id", "source", "target", "description", "keywords", "weight", "rank"]
[
"id",
"source",
"target",
"description",
"keywords",
"weight",
"rank",
"created_at",
]
]
for i, e in enumerate(use_relations):
created_at = e.get("created_at", "UNKNOWN")
# Convert timestamp to readable format
if isinstance(created_at, (int, float)):
created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
relations_section_list.append(
[
i,
Expand All @@ -745,6 +763,7 @@ async def _get_node_data(
e["keywords"],
e["weight"],
e["rank"],
created_at,
]
)
relations_context = list_of_list_to_csv(relations_section_list)
Expand Down Expand Up @@ -892,7 +911,13 @@ async def _get_edge_data(
*[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
)
edge_datas = [
{"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v}
{
"src_id": k["src_id"],
"tgt_id": k["tgt_id"],
"rank": d,
"created_at": k.get("__created_at__", None),  # time metadata carried on the query result (written as __created_at__ at vector-store upsert)
**v,
}
for k, v, d in zip(results, edge_datas, edge_degree)
if v is not None
]
Expand All @@ -916,9 +941,22 @@ async def _get_edge_data(
)

relations_section_list = [
["id", "source", "target", "description", "keywords", "weight", "rank"]
[
"id",
"source",
"target",
"description",
"keywords",
"weight",
"rank",
"created_at",
]
]
for i, e in enumerate(edge_datas):
created_at = e.get("created_at", "Unknown")
# Convert timestamp to readable format
if isinstance(created_at, (int, float)):
created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
relations_section_list.append(
[
i,
Expand All @@ -928,6 +966,7 @@ async def _get_edge_data(
e["keywords"],
e["weight"],
e["rank"],
created_at,
]
)
relations_context = list_of_list_to_csv(relations_section_list)
Expand Down Expand Up @@ -1259,9 +1298,15 @@ async def get_vector_context():
chunks_ids = [r["id"] for r in results]
chunks = await text_chunks_db.get_by_ids(chunks_ids)

valid_chunks = [
chunk for chunk in chunks if chunk is not None and "content" in chunk
]
valid_chunks = []
for chunk, result in zip(chunks, results):
if chunk is not None and "content" in chunk:
# Merge chunk content and time metadata
chunk_with_time = {
"content": chunk["content"],
"created_at": result.get("created_at", None),
}
valid_chunks.append(chunk_with_time)

if not valid_chunks:
return None
Expand All @@ -1275,7 +1320,15 @@ async def get_vector_context():
if not maybe_trun_chunks:
return None

return "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
# Include time information in content
formatted_chunks = []
for c in maybe_trun_chunks:
chunk_text = c["content"]
if c["created_at"]:
chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
formatted_chunks.append(chunk_text)

return "\n--New Chunk--\n".join(formatted_chunks)
except Exception as e:
logger.error(f"Error in get_vector_context: {e}")
return None
Expand Down
21 changes: 19 additions & 2 deletions lightrag/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,12 @@
If you don't know the answer, just say so. Do not make anything up.
Do not include information where the supporting evidence for it is not provided.
When handling relationships with timestamps:
1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
2. When encountering conflicting relationships, consider both the semantic content and the timestamp
3. Don't automatically prefer the most recently created relationships - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
---Target response length and format---
{response_type}
Expand All @@ -172,8 +178,7 @@
{context_data}
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""

PROMPTS["keywords_extraction"] = """---Role---
Expand Down Expand Up @@ -250,6 +255,12 @@
If you don't know the answer, just say so. Do not make anything up.
Do not include information where the supporting evidence for it is not provided.
When handling content with timestamps:
1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
2. When encountering conflicting information, consider both the content and the timestamp
3. Don't automatically prefer the most recent content - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
---Target response length and format---
{response_type}
Expand Down Expand Up @@ -293,6 +304,12 @@
Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
When handling information with timestamps:
1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
2. When encountering conflicting information, consider both the content/relationship and the timestamp
3. Don't automatically prefer the most recent information - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
---Data Sources---
1. Knowledge Graph Data:
Expand Down
12 changes: 11 additions & 1 deletion lightrag/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import networkx as nx
import numpy as np
from nano_vectordb import NanoVectorDB
import time

from .utils import (
logger,
Expand Down Expand Up @@ -87,9 +88,12 @@ async def upsert(self, data: dict[str, dict]):
if not len(data):
logger.warning("You insert an empty data to vector DB")
return []

current_time = time.time()
list_data = [
{
"__id__": k,
"__created_at__": current_time,
**{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
}
for k, v in data.items()
Expand Down Expand Up @@ -132,7 +136,13 @@ async def query(self, query: str, top_k=5):
better_than_threshold=self.cosine_better_than_threshold,
)
results = [
{**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
{
**dp,
"id": dp["__id__"],
"distance": dp["__metrics__"],
"created_at": dp.get("__created_at__"),
}
for dp in results
]
return results

Expand Down

0 comments on commit 889d056

Please sign in to comment.