doc_chunk updates and new parameters

Signed-off-by: Michele Dolfi <[email protected]>
IBM · Sep 16, 2024 · 939ed05 · 939ed05
1 parent dd96ca0
commit 939ed05
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 12 deletions.
diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md
@@ -2,7 +2,7 @@
 
 This transform chunks documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter).
 
-When using documents converted to JSON, the transform leverages the [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`
+When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker`
 to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.
 It relies on documents converted with the Docling library in the [pdf2parquet transform](../pdf2parquet) using the option `contents_type: "application/json"`,
 which provides the required JSON structure.
@@ -19,12 +19,13 @@ The transform can be tuned with the following parameters.
 
 | Parameter  | Default  | Description  |
 |------------|----------|--------------|
-| `chunking_type`        | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.ibm.com/DeepSearch/quackling). |
-| `content_column_name_key`        | `contents` | Name of the column containing the text to be chunked. |
-| `output_chunk_column_name_key`   | `contents` | Column name to store the chunks in the output table. |
-| `output_jsonpath_column_name_key`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
-| `output_pageno_column_name_key`  | `page_number` | Column name to store the page number of the chunk in the output table. |
-| `output_bbox_column_name_key`    | `bbox` | Column name to store the bbox of the chunk in the output table. |
+| `chunking_type`        | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
+| `content_column_name`        | `contents` | Name of the column containing the text to be chunked. |
+| `dl_min_chunk_len`           | `None` | Minimum number of characters for a chunk in the dl_json chunker. When set to `None`, the library default is used, i.e. `min_chunk_len=64`. |
+| `output_chunk_column_name`   | `contents` | Column name to store the chunks in the output table. |
+| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
+| `output_pageno_column_name`  | `page_number` | Column name to store the page number of the chunk in the output table. |
+| `output_bbox_column_name`    | `bbox` | Column name to store the bbox of the chunk in the output table. |
 
 When invoking the CLI, the parameters must be set as `--doc_chunk_<name>`, e.g. `--doc_chunk_output_chunk_column_name=myoutput`.
 

diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml
@@ -12,7 +12,8 @@ authors = [
 ]
 dependencies = [
     "data-prep-toolkit==0.2.1.dev0",
-    "quackling==0.4.0",
+    "docling-core==1.3.0",
+    "llama-index-core>=0.11.0,<0.12.0",
 ]
 
 [build-system]

diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -10,14 +10,13 @@
 # limitations under the License.
 ################################################################################
 
-import math
 from abc import ABCMeta, abstractmethod
-from typing import Iterator
+from typing import Iterator, Optional
 
 from docling_core.types import Document as DLDocument
 from llama_index.core import Document as LIDocument
 from llama_index.core.node_parser import MarkdownNodeParser
-from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
+from docling_core.transforms.chunker import HierarchicalChunker
 
 
 class ChunkingExecutor(metaclass=ABCMeta):
@@ -29,6 +28,7 @@ def chunk(self, content: str) -> Iterator[dict]:
 class DLJsonChunker(ChunkingExecutor):
     def __init__(
         self,
+        min_chunk_len: Optional[int],
         output_chunk_column_name: str,
         output_jsonpath_column_name: str,
         output_pageno_column_name_key: str,
@@ -38,7 +38,11 @@ def __init__(
         self.output_jsonpath_column_name = output_jsonpath_column_name
         self.output_pageno_column_name_key = output_pageno_column_name_key
         self.output_bbox_column_name_key = output_bbox_column_name_key
-        self._chunker = HierarchicalChunker(include_metadata=True)
+
+        chunker_kwargs = dict(include_metadata=True)
+        if min_chunk_len is not None:
+            chunker_kwargs["min_chunk_len"] = min_chunk_len
+        self._chunker = HierarchicalChunker(**chunker_kwargs)
 
     def chunk(self, content: str) -> Iterator[dict]:
         doc = DLDocument.model_validate_json(content)

diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -25,12 +25,14 @@
 cli_prefix = f"{short_name}_"
 content_column_name_key = "content_column_name"
 chunking_type_key = "chunking_type"
+dl_min_chunk_len_key = "dl_min_chunk_len"
 output_chunk_column_name_key = "output_chunk_column_name"
 output_jsonpath_column_name_key = "output_jsonpath_column_name"
 output_pageno_column_name_key = "output_pageno_column_name"
 output_bbox_column_name_key = "output_bbox_column_name"
 content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
 chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
+dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
 output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
 output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
 output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
@@ -47,6 +49,7 @@ def __str__(self):
 
 default_content_column_name = "contents"
 default_chunking_type = chunking_types.DL_JSON
+default_dl_min_chunk_len = None
 default_output_chunk_column_name = "contents"
 default_output_jsonpath_column_name = "doc_jsonpath"
 default_output_pageno_column_name = "page_number"
@@ -76,6 +79,7 @@ def __init__(self, config: dict[str, Any]):
         self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
 
         # Parameters for Docling JSON chunking
+        self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
         self.output_jsonpath_column_name = config.get(
             output_jsonpath_column_name_key, default_output_jsonpath_column_name
         )
@@ -89,6 +93,7 @@ def __init__(self, config: dict[str, Any]):
         self.chunker: ChunkingExecutor
         if self.chunking_type == chunking_types.DL_JSON:
             self.chunker = DLJsonChunker(
+                min_chunk_len=self.dl_min_chunk_len,
                 output_chunk_column_name=self.output_chunk_column_name,
                 output_jsonpath_column_name=self.output_jsonpath_column_name,
                 output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -162,6 +167,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=default_content_column_name,
             help="Name of the column containing the text to be chunked",
         )
+        parser.add_argument(
+            f"--{dl_min_chunk_len_cli_param}",
+            default=default_dl_min_chunk_len,
+            help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
+        )
         parser.add_argument(
             f"--{output_chunk_column_name_cli_param}",
             default=default_output_chunk_column_name,