Skip to content

Commit

Permalink
doc_chunk updates and new parameters
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Sep 16, 2024
1 parent dd96ca0 commit 939ed05
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 12 deletions.
15 changes: 8 additions & 7 deletions transforms/language/doc_chunk/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This transform chunks documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter).

When using documents converted to JSON, the transform leverages the [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`
When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker`
to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.
It relies on documents converted with the Docling library in the [pdf2parquet transform](../pdf2parquet) using the option `contents_type: "application/json"`,
which provides the required JSON structure.
Expand All @@ -19,12 +19,13 @@ The transform can be tuned with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.ibm.com/DeepSearch/quackling). |
| `content_column_name_key` | `contents` | Name of the column containing the text to be chunked. |
| `output_chunk_column_name_key` | `contents` | Column name to store the chunks in the output table. |
| `output_jsonpath_column_name_key`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
| `output_pageno_column_name_key` | `page_number` | Column name to store the page number of the chunk in the output table. |
| `output_bbox_column_name_key` | `bbox` | Column name to store the bbox of the chunk in the output table. |
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for a chunk in the `dl_json` chunker. Setting it to `None` uses the library default, i.e. `min_chunk_len=64`. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
| `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. |
| `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. |

When invoking the CLI, the parameters must be set as `--doc_chunk_<name>`, e.g. `--doc_chunk_output_chunk_column_name=myoutput`.

Expand Down
3 changes: 2 additions & 1 deletion transforms/language/doc_chunk/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ authors = [
]
dependencies = [
"data-prep-toolkit==0.2.1.dev0",
"quackling==0.4.0",
"docling-core==1.3.0",
"llama-index-core>=0.11.0,<0.12.0",
]

[build-system]
Expand Down
12 changes: 8 additions & 4 deletions transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@
# limitations under the License.
################################################################################

import math
from abc import ABCMeta, abstractmethod
from typing import Iterator
from typing import Iterator, Optional

from docling_core.types import Document as DLDocument
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
from docling_core.transforms.chunker import HierarchicalChunker


class ChunkingExecutor(metaclass=ABCMeta):
Expand All @@ -29,6 +28,7 @@ def chunk(self, content: str) -> Iterator[dict]:
class DLJsonChunker(ChunkingExecutor):
def __init__(
self,
min_chunk_len: Optional[int],
output_chunk_column_name: str,
output_jsonpath_column_name: str,
output_pageno_column_name_key: str,
Expand All @@ -38,7 +38,11 @@ def __init__(
self.output_jsonpath_column_name = output_jsonpath_column_name
self.output_pageno_column_name_key = output_pageno_column_name_key
self.output_bbox_column_name_key = output_bbox_column_name_key
self._chunker = HierarchicalChunker(include_metadata=True)

chunker_kwargs = dict(include_metadata=True)
if min_chunk_len is not None:
chunker_kwargs["min_chunk_len"] = min_chunk_len
self._chunker = HierarchicalChunker(**chunker_kwargs)

def chunk(self, content: str) -> Iterator[dict]:
doc = DLDocument.model_validate_json(content)
Expand Down
10 changes: 10 additions & 0 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@
cli_prefix = f"{short_name}_"
content_column_name_key = "content_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
output_chunk_column_name_key = "output_chunk_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
output_bbox_column_name_key = "output_bbox_column_name"
content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
Expand All @@ -47,6 +49,7 @@ def __str__(self):

default_content_column_name = "contents"
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
Expand Down Expand Up @@ -76,6 +79,7 @@ def __init__(self, config: dict[str, Any]):
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)

# Parameters for Docling JSON chunking
self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
self.output_jsonpath_column_name = config.get(
output_jsonpath_column_name_key, default_output_jsonpath_column_name
)
Expand All @@ -89,6 +93,7 @@ def __init__(self, config: dict[str, Any]):
self.chunker: ChunkingExecutor
if self.chunking_type == chunking_types.DL_JSON:
self.chunker = DLJsonChunker(
min_chunk_len=self.dl_min_chunk_len,
output_chunk_column_name=self.output_chunk_column_name,
output_jsonpath_column_name=self.output_jsonpath_column_name,
output_pageno_column_name_key=self.output_pageno_column_name_key,
Expand Down Expand Up @@ -162,6 +167,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_content_column_name,
help="Name of the column containing the text to be chunked",
)
parser.add_argument(
f"--{dl_min_chunk_len_cli_param}",
default=default_dl_min_chunk_len,
help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
)
parser.add_argument(
f"--{output_chunk_column_name_cli_param}",
default=default_output_chunk_column_name,
Expand Down

0 comments on commit 939ed05

Please sign in to comment.