From a9402513909606c76a2e8d5e040f12ecb8aa4739 Mon Sep 17 00:00:00 2001
From: Gurjot Singh
Date: Tue, 7 Jan 2025 20:57:39 +0530
Subject: [PATCH 1/2] Implement custom chunking feature

---
 lightrag/lightrag.py | 66 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7496d736..2225b2d1 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -458,6 +458,72 @@ async def ainsert(self, string_or_strings, split_by_character):
             # Ensure all indexes are updated after each document
             await self._insert_done()
 
+    def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
+        loop = always_get_an_event_loop()
+        return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
+
+    async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
+
+        update_storage = False
+        try:
+            doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
+            new_docs = {
+                doc_key: {"content": full_text.strip()}
+            }
+
+            _add_doc_keys = await self.full_docs.filter_keys([doc_key])
+            new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+            if not len(new_docs):
+                logger.warning("This document is already in the storage.")
+                return
+
+            update_storage = True
+            logger.info(f"[New Docs] inserting {len(new_docs)} docs")
+
+            inserting_chunks = {}
+            for chunk_text in text_chunks:
+                chunk_text_stripped = chunk_text.strip()
+                chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
+
+                inserting_chunks[chunk_key] = {
+                    "content": chunk_text_stripped,
+                    "full_doc_id": doc_key,
+                }
+
+            _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys()))
+            inserting_chunks = {
+                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+            }
+            if not len(inserting_chunks):
+                logger.warning("All chunks are already in the storage.")
+                return
+
+            logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
+
+            await self.chunks_vdb.upsert(inserting_chunks)
+
+            logger.info("[Entity Extraction]...")
+            maybe_new_kg = await extract_entities(
+                inserting_chunks,
+                knowledge_graph_inst=self.chunk_entity_relation_graph,
+                entity_vdb=self.entities_vdb,
+                relationships_vdb=self.relationships_vdb,
+                global_config=asdict(self),
+            )
+
+            if maybe_new_kg is None:
+                logger.warning("No new entities and relationships found")
+                return
+            else:
+                self.chunk_entity_relation_graph = maybe_new_kg
+
+            await self.full_docs.upsert(new_docs)
+            await self.text_chunks.upsert(inserting_chunks)
+
+        finally:
+            if update_storage:
+                await self._insert_done()
+
     async def _insert_done(self):
         tasks = []
         for storage_inst in [

From 9565a4663ad8878126f16d667455ca5a22f1d557 Mon Sep 17 00:00:00 2001
From: Gurjot Singh
Date: Thu, 9 Jan 2025 00:39:22 +0530
Subject: [PATCH 2/2] Fix trailing whitespace and formatting issues in lightrag.py

---
 lightrag/lightrag.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 2225b2d1..6af29aa2 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -460,16 +460,15 @@ async def ainsert(self, string_or_strings, split_by_character):
 
     def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
         loop = always_get_an_event_loop()
-        return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
+        return loop.run_until_complete(
+            self.ainsert_custom_chunks(full_text, text_chunks)
+        )
 
     async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
-
         update_storage = False
         try:
             doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
-            new_docs = {
-                doc_key: {"content": full_text.strip()}
-            }
+            new_docs = {doc_key: {"content": full_text.strip()}}
 
             _add_doc_keys = await self.full_docs.filter_keys([doc_key])
             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
@@ -484,13 +483,15 @@ async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
             for chunk_text in text_chunks:
                 chunk_text_stripped = chunk_text.strip()
                 chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
-
+
                 inserting_chunks[chunk_key] = {
                     "content": chunk_text_stripped,
                     "full_doc_id": doc_key,
                 }
 
-            _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys()))
+            _add_chunk_keys = await self.text_chunks.filter_keys(
+                list(inserting_chunks.keys())
+            )
             inserting_chunks = {
                 k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
             }
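For context, a minimal usage sketch of the API introduced in PATCH 1/2, assuming an already-constructed LightRAG instance named `rag` (its constructor arguments, the example text, and the chunk list are illustrative and not part of this patch):

    # Hypothetical usage; `rag` is an initialized LightRAG instance.
    full_text = (
        "LightRAG builds a knowledge graph from inserted documents. "
        "Custom chunking lets callers control how the text is split."
    )
    text_chunks = [
        "LightRAG builds a knowledge graph from inserted documents.",
        "Custom chunking lets callers control how the text is split.",
    ]

    # Synchronous wrapper; drives ainsert_custom_chunks on an event loop.
    rag.insert_custom_chunks(full_text, text_chunks)

    # From async code, the coroutine can be awaited directly:
    # await rag.ainsert_custom_chunks(full_text, text_chunks)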