diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md
index 4e56972f5..f962717d6 100644
--- a/transforms/language/doc_chunk/python/README.md
+++ b/transforms/language/doc_chunk/python/README.md
@@ -29,10 +29,12 @@ The transform can be tuned with the following parameters.
 
 | Parameter | Default | Description |
 |------------|----------|--------------|
-| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
+| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), and `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-size windows of tokens. |
 | `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
 | `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
 | `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
+| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
+| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
 | `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
 | `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
 | `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
index 3deb1ecdc..a8ba44f61 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -11,9 +11,10 @@
 ################################################################################
 
 from abc import ABCMeta, abstractmethod
-from typing import Iterator, Optional
+from typing import Iterator, Optional, Dict, List
 
 from docling_core.types import Document as DLDocument
+from llama_index.core.node_parser.text.token import TokenTextSplitter
 from llama_index.core import Document as LIDocument
 from llama_index.core.node_parser import MarkdownNodeParser
 from docling_core.transforms.chunker import HierarchicalChunker
@@ -66,3 +67,73 @@ def chunk(self, content: str) -> Iterator[dict]:
             yield {
                 self.output_chunk_column_name: node.text,
             }
+
+
+class LITokenTextSplitter(ChunkingExecutor):
+    """
+    A text chunker that leverages the LlamaIndex token-based text splitter. This splitter breaks input text into
+    fixed-window chunks, with each chunk measured in tokens rather than characters.
+
+    The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between
+    chunks (also measured in tokens) can be specified to preserve context between the chunks.
+
+    Args:
+        output_chunk_column_name (str): Name of the output column containing the text of each chunk.
+        output_chunk_column_id (str): Name of the output column containing the ID of each chunk.
+        chunk_size_tokens (int): Length of each chunk in number of tokens.
+        chunk_overlap_tokens (int): Number of tokens overlapping between consecutive chunks.
+
+    Attributes:
+        output_chunk_column_name (str)
+        output_chunk_column_id (str)
+        chunk_size_tokens (int)
+        chunk_overlap_tokens (int)
+    """
+
+    def __init__(
+        self,
+        output_chunk_column_name: str,
+        output_chunk_column_id: str,
+        chunk_size_tokens: int,
+        chunk_overlap_tokens: int
+    ):
+        self.output_chunk_column_name = output_chunk_column_name
+        self.output_chunk_column_id = output_chunk_column_id
+        self.chunk_size = chunk_size_tokens
+        self.chunk_overlap = chunk_overlap_tokens
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Internal method to chunk text using TokenTextSplitter.
+
+        Args:
+            text (str): Input text to be chunked.
+
+        Returns:
+            List[str]: List of chunked text.
+        """
+        text_splitter = TokenTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap
+        )
+        return text_splitter.split_text(text)
+
+
+    def chunk(self, text: str) -> Iterator[Dict]:
+        """
+        Chunks input text into fixed-window lengths with token overlap.
+
+        Args:
+            text (str): Input text to be chunked.
+
+        Yields:
+            Dict: Chunked text with ID.
+        """
+        chunk_id = 0
+        for chunk in self._chunk_text(text):
+            yield {
+                self.output_chunk_column_id: chunk_id,
+                self.output_chunk_column_name: chunk,
+            }
+            chunk_id += 1
\ No newline at end of file
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
index 20d980c22..e0fdfa871 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
@@ -17,11 +17,12 @@
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
 from doc_chunk_transform_python import DocChunkPythonTransformConfiguration
-
+from doc_chunk_transform import chunking_types
 
 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
 # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
+# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_token_text"))
 output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
 local_conf = {
     "input_folder": input_folder,
@@ -39,6 +40,11 @@
     # doc_chunk params
     # "doc_chunk_chunking_type": "li_markdown",
    "doc_chunk_chunking_type": "dl_json",
+    # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
+    # fixed-size params
+    # "doc_chunk_output_chunk_column_name": "chunk_text",
+    # "doc_chunk_chunk_size_tokens": 128,
+    # "doc_chunk_chunk_overlap_tokens": 30
 }
 if __name__ == "__main__":
     # Set the simulated command line args
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
index 5495cf778..7acdd3ef1 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -18,7 +18,7 @@
 import pyarrow as pa
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
 from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
-from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
+from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter
 
 
 short_name = "doc_chunk"
@@ -27,7 +27,10 @@
 doc_id_column_name_key = "doc_id_column_name"
 chunking_type_key = "chunking_type"
 dl_min_chunk_len_key = "dl_min_chunk_len"
+chunk_size_tokens_key = "chunk_size_tokens"
+chunk_overlap_tokens_key = "chunk_overlap_tokens"
 output_chunk_column_name_key = "output_chunk_column_name"
+output_chunk_column_id_key = "output_chunk_column_id"
 output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
 output_jsonpath_column_name_key = "output_jsonpath_column_name"
 output_pageno_column_name_key = "output_pageno_column_name"
@@ -41,11 +44,13 @@
 output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
 output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
 output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"
-
+chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}"
+chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}"
 
 class chunking_types(str, enum.Enum):
     LI_MARKDOWN = "li_markdown"
     DL_JSON = "dl_json"
+    LI_TOKEN_TEXT = "li_token_text"
 
     def __str__(self):
         return str(self.value)
@@ -56,11 +61,13 @@ def __str__(self):
 default_chunking_type = chunking_types.DL_JSON
 default_dl_min_chunk_len = None
 default_output_chunk_column_name = "contents"
+default_output_chunk_column_id = "chunk_id"
 default_output_source_doc_id_column_name = "source_document_id"
 default_output_jsonpath_column_name = "doc_jsonpath"
 default_output_pageno_column_name = "page_number"
 default_output_bbox_column_name = "bbox"
-
+default_chunk_size_tokens = 128
+default_chunk_overlap_tokens = 30
 
 class DocChunkTransform(AbstractTableTransform):
     """
@@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
         self.content_column_name = config.get(content_column_name_key, default_content_column_name)
         self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
         self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
+        self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
         self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)
 
         # Parameters for Docling JSON chunking
@@ -96,6 +104,10 @@ def __init__(self, config: dict[str, Any]):
         )
         self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)
 
+        # Parameters for Fixed-size with overlap chunking
+        self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
+        self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)
+
         # Initialize chunker
         self.chunker: ChunkingExecutor
 
@@ -111,6 +123,13 @@ def __init__(self, config: dict[str, Any]):
             self.chunker = LIMarkdown(
                 output_chunk_column_name=self.output_chunk_column_name,
             )
+        elif self.chunking_type == chunking_types.LI_TOKEN_TEXT:
+            self.chunker = LITokenTextSplitter(
+                output_chunk_column_name=self.output_chunk_column_name,
+                output_chunk_column_id=self.output_chunk_column_id,
+                chunk_size_tokens=self.chunk_size_tokens,
+                chunk_overlap_tokens=self.chunk_overlap_tokens
+            )
         else:
             raise RuntimeError(f"{self.chunking_type=} is not valid.")
 
@@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=default_output_bbox_column_name,
             help="Column name to store the bbox of the chunk",
         )
+        parser.add_argument(
+            f"--{chunk_size_tokens_cli_param}",
+            default=default_chunk_size_tokens,
+            type=int,
+            help="Size of the chunk in tokens for the fixed-size chunker.",
+        )
+        parser.add_argument(
+            f"--{chunk_overlap_tokens_cli_param}",
+            default=default_chunk_overlap_tokens,
+            type=int,
+            help="Number of tokens overlapping between chunks for the fixed-size chunker.",
+        )
 
     def apply_input_params(self, args: Namespace) -> bool:
         """
diff --git a/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json b/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json
new file mode 100644
index 000000000..4d84b5915
--- /dev/null
+++ b/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json
@@ -0,0 +1,56 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "doc_chunk",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-04 14:00:40",
+        "end_time": "2024-10-04 14:00:41",
+        "status": "success"
+    },
+    "code": {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path"
+    },
+    "job_input_params": {
+        "chunking_type": "li_token_text",
+        "content_column_name": "contents",
+        "doc_id_column_name": "document_id",
+        "dl_min_chunk_len": null,
+        "output_chunk_column_name": "chunk_text",
+        "output_source_doc_id_column_name": "source_document_id",
+        "output_jsonpath_column_name": "doc_jsonpath",
+        "output_pageno_column_name": "page_number",
+        "output_bbox_column_name": "bbox",
+        "chunk_size_tokens": 128,
+        "chunk_overlap_tokens": 30,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [
+            ".parquet"
+        ],
+        "num_processors": 0
+    },
+    "job_output_stats": {
+        "source_files": 1,
+        "source_size": 17749,
+        "result_files": 1,
+        "result_size": 8827,
+        "processing_time": 0.194,
+        "nfiles": 1,
+        "nrows": 10,
+        "source_doc_count": 2,
+        "result_doc_count": 10
+    },
+    "source": {
+        "name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_token_text",
+        "type": "path"
+    },
+    "target": {
+        "name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/output",
+        "type": "path"
+    }
+}
\ No newline at end of file
diff --git a/transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet b/transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet
new file mode 100644
index 000000000..7c7065de0
Binary files /dev/null and b/transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet differ
diff --git a/transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet b/transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet
new file mode 100644
index 000000000..bcfb98661
Binary files /dev/null and b/transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet differ
diff --git a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py b/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
index c593e1de5..5ecfa49a2 100644
--- a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
+++ b/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
@@ -16,7 +16,11 @@
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from doc_chunk_transform import chunking_type_cli_param, chunking_types
+from doc_chunk_transform import (
+    chunking_type_cli_param,
+    output_chunk_column_name_cli_param,
+    chunking_types
+)
 from doc_chunk_transform_python import DocChunkPythonTransformConfiguration
 
 
@@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
                 basedir + "/expected_md",
             )
         )
+
+        # Run with fixed size token chunker
+        fixtures.append(
+            (
+                launcher,
+                {
+                    chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT,
+                    output_chunk_column_name_cli_param: "chunk_text"
+                },
+                basedir + "/input_token_text",
+                basedir + "/expected_token_text",
+            )
+        )
         return fixtures
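
For anyone reviewing the new chunker in isolation, the `LITokenTextSplitter` class added in `doc_chunk_chunkers.py` can also be exercised directly, outside the transform runtime. The following is a minimal sketch, not part of this change: it assumes `llama-index-core` is installed and the transform's `src` directory is on the Python path, and the sample text and column names are illustrative only.

```python
# Hypothetical smoke test for the token-based chunker added in this PR.
from doc_chunk_chunkers import LITokenTextSplitter

chunker = LITokenTextSplitter(
    output_chunk_column_name="chunk_text",  # same value as doc_chunk_output_chunk_column_name above
    output_chunk_column_id="chunk_id",
    chunk_size_tokens=128,                  # default of doc_chunk_chunk_size_tokens
    chunk_overlap_tokens=30,                # default of doc_chunk_chunk_overlap_tokens
)

sample_text = "lorem ipsum " * 400  # made-up input, long enough to produce several 128-token windows
for row in chunker.chunk(sample_text):
    # Each yielded dict carries the chunk index and chunk text under the configured column names,
    # mirroring the columns the transform writes into the output table.
    print(row["chunk_id"], row["chunk_text"][:60])
```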