diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index 1440b86e6..ca3cdec1b 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -2,7 +2,7 @@ This transform is chunking documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter). -When using documents converted to JSON, the transform leverages the [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker` +When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker` to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc. It relies on documents converted with the Docling library in the [pdf2parquet transform](../pdf2parquet) using the option `contents_type: "application/json"`, which provides the required JSON structure. @@ -19,12 +19,13 @@ The transform can be tuned with the following parameters. | Parameter | Default | Description | |------------|----------|--------------| -| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.ibm.com/DeepSearch/quackling). | -| `content_column_name_key` | `contents` | Name of the column containing the text to be chunked. | -| `output_chunk_column_name_key` | `contents` | Column name to store the chunks in the output table. | -| `output_jsonpath_column_name_key`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. | -| `output_pageno_column_name_key` | `page_number` | Column name to store the page number of the chunk in the output table. 
| -| `output_bbox_column_name_key` | `bbox` | Column name to store the bbox of the chunk in the output table. | +| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). | +| `content_column_name` | `contents` | Name of the column containing the text to be chunked. | +| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting it to `None` uses the library default, i.e. `min_chunk_len=64`. | +| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. | +| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. | +| `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. | +| `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. | When invoking the CLI, the parameters must be set as `--doc_chunk_`, e.g. `--doc_chunk_column_name_key=myoutput`. 
diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 1b4988bdc..1dbf38560 100644 --- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -12,7 +12,8 @@ authors = [ ] dependencies = [ "data-prep-toolkit==0.2.1.dev0", - "quackling==0.4.0", + "docling-core==1.3.0", + "llama-index-core>=0.11.0,<0.12.0", ] [build-system] diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py index b144a93bc..3deb1ecdc 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py @@ -10,14 +10,13 @@ # limitations under the License. ################################################################################ -import math from abc import ABCMeta, abstractmethod -from typing import Iterator +from typing import Iterator, Optional from docling_core.types import Document as DLDocument from llama_index.core import Document as LIDocument from llama_index.core.node_parser import MarkdownNodeParser -from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker +from docling_core.transforms.chunker import HierarchicalChunker class ChunkingExecutor(metaclass=ABCMeta): @@ -29,6 +28,7 @@ def chunk(self, content: str) -> Iterator[dict]: class DLJsonChunker(ChunkingExecutor): def __init__( self, + min_chunk_len: Optional[int], output_chunk_column_name: str, output_jsonpath_column_name: str, output_pageno_column_name_key: str, @@ -38,7 +38,11 @@ def __init__( self.output_jsonpath_column_name = output_jsonpath_column_name self.output_pageno_column_name_key = output_pageno_column_name_key self.output_bbox_column_name_key = output_bbox_column_name_key - self._chunker = HierarchicalChunker(include_metadata=True) + + chunker_kwargs = dict(include_metadata=True) + if min_chunk_len is not None: 
+ chunker_kwargs["min_chunk_len"] = min_chunk_len + self._chunker = HierarchicalChunker(**chunker_kwargs) def chunk(self, content: str) -> Iterator[dict]: doc = DLDocument.model_validate_json(content) diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py index f5f13c03c..d52bf581d 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py @@ -25,12 +25,14 @@ cli_prefix = f"{short_name}_" content_column_name_key = "content_column_name" chunking_type_key = "chunking_type" +dl_min_chunk_len_key = "dl_min_chunk_len" output_chunk_column_name_key = "output_chunk_column_name" output_jsonpath_column_name_key = "output_jsonpath_column_name" output_pageno_column_name_key = "output_pageno_column_name" output_bbox_column_name_key = "output_bbox_column_name" content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}" chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}" +dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}" output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}" output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}" output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}" @@ -47,6 +49,7 @@ def __str__(self): default_content_column_name = "contents" default_chunking_type = chunking_types.DL_JSON +default_dl_min_chunk_len = None default_output_chunk_column_name = "contents" default_output_jsonpath_column_name = "doc_jsonpath" default_output_pageno_column_name = "page_number" @@ -76,6 +79,7 @@ def __init__(self, config: dict[str, Any]): self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name) # Parameters for Docling JSON chunking + self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len) 
self.output_jsonpath_column_name = config.get( output_jsonpath_column_name_key, default_output_jsonpath_column_name ) @@ -89,6 +93,7 @@ def __init__(self, config: dict[str, Any]): self.chunker: ChunkingExecutor if self.chunking_type == chunking_types.DL_JSON: self.chunker = DLJsonChunker( + min_chunk_len=self.dl_min_chunk_len, output_chunk_column_name=self.output_chunk_column_name, output_jsonpath_column_name=self.output_jsonpath_column_name, output_pageno_column_name_key=self.output_pageno_column_name_key, @@ -162,6 +167,11 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=default_content_column_name, help="Name of the column containing the text to be chunked", ) + parser.add_argument( + f"--{dl_min_chunk_len_cli_param}", + default=default_dl_min_chunk_len, + help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.", + ) parser.add_argument( f"--{output_chunk_column_name_cli_param}", default=default_output_chunk_column_name,