From 242fad147b19db35babc56c9cee745c348ad4bcd Mon Sep 17 00:00:00 2001 From: Juan Cappi Date: Sun, 29 Sep 2024 09:54:24 -0300 Subject: [PATCH 1/5] feat: #641 - first draft implementation, python only Signed-off-by: Juan Cappi --- .../language/doc_chunk/python/README.md | 4 +- .../python/src/doc_chunk_chunkers.py | 69 +++++++++++++++++- .../python/src/doc_chunk_local_python.py | 6 ++ .../python/src/doc_chunk_transform.py | 37 +++++++++- .../expected_fixed_size/sample1.parquet | Bin 0 -> 8827 bytes .../input_fixed_size/sample1.parquet | Bin 0 -> 17749 bytes .../python/test/test_doc_chunk_python.py | 19 ++++- 7 files changed, 129 insertions(+), 6 deletions(-) create mode 100644 transforms/language/doc_chunk/python/test-data/expected_fixed_size/sample1.parquet create mode 100644 transforms/language/doc_chunk/python/test-data/input_fixed_size/sample1.parquet diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index 4e56972f5..eea2d8105 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -29,10 +29,12 @@ The transform can be tuned with the following parameters. | Parameter | Default | Description | |------------|----------|--------------| -| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). | +| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `fixed_size` for chunking text into fixed-sized windows of tokens, where both the window size and overlap between windows are measured in tokens. | | `content_column_name` | `contents` | Name of the column containing the text to be chunked. | | `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. | | `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. | +| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the fixed-sized chunker. | +| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the fixed-sized chunker. | | `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. | | `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. | | `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. 
| diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py index 3deb1ecdc..3b124aa11 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py @@ -11,9 +11,10 @@ ################################################################################ from abc import ABCMeta, abstractmethod -from typing import Iterator, Optional +from typing import Iterator, Optional, Dict, List from docling_core.types import Document as DLDocument +from llama_index.core.node_parser.text.token import TokenTextSplitter from llama_index.core import Document as LIDocument from llama_index.core.node_parser import MarkdownNodeParser from docling_core.transforms.chunker import HierarchicalChunker @@ -66,3 +67,69 @@ def chunk(self, content: str) -> Iterator[dict]: yield { self.output_chunk_column_name: node.text, } + + +class FixedTokenSizeChunker(ChunkingExecutor): + """ + Chunks input text into fixed-window lengths, measured in tokens, with an overlap also measured in tokens. + + Args: + output_chunk_column_name (str): Name of the output column containing the text of each chunk. + output_chunk_column_id (str): Name of the output column containing the ID of each chunk. + chunk_size_tokens (int): Length of each chunk in number of tokens. + chunk_overlap_tokens (int): Number of tokens overlapping between consecutive chunks. + + Attributes: + output_chunk_column_name (str) + output_chunk_column_id (str) + chunk_size_tokens (int) + chunk_overlap_tokens (int) + """ + + def __init__( + self, + output_chunk_column_name: str, + output_chunk_column_id: str, + chunk_size_tokens: int, + chunk_overlap_tokens: int + ): + self.output_chunk_column_name = output_chunk_column_name + self.output_chunk_column_id = output_chunk_column_id + self.chunk_size = chunk_size_tokens + self.chunk_overlap = chunk_overlap_tokens + + + def _chunk_text(self, text: str) -> List[str]: + """ + Internal method to chunk text using TokenTextSplitter. + + Args: + text (str): Input text to be chunked. + + Returns: + List[str]: List of chunked text. + """ + text_splitter = TokenTextSplitter( + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap + ) + return text_splitter.split_text(text) + + + def chunk(self, text: str) -> Iterator[Dict]: + """ + Chunks input text into fixed-window lengths with token overlap. + + Args: + text (str): Input text to be chunked. + + Yields: + Dict: Chunked text with ID. 
+ """ + chunk_id = 0 + for chunk in self._chunk_text(text): + yield { + self.output_chunk_column_id: chunk_id, + self.output_chunk_column_name: chunk, + } + chunk_id += 1 \ No newline at end of file diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py index 20d980c22..ecadef287 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py @@ -22,6 +22,7 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md")) +# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_fixed_size")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, @@ -39,6 +40,11 @@ # doc_chunk params # "doc_chunk_chunking_type": "li_markdown", "doc_chunk_chunking_type": "dl_json", + # "doc_chunk_chunking_type": "fixed_size", + # fixed-size params + # "doc_chunk_output_chunk_column_name": "chunk_text", + # "doc_chunk_chunk_size_tokens": 128, + # "doc_chunk_chunk_overlap_tokens": 30 } if __name__ == "__main__": # Set the simulated command line args diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py index 5495cf778..2ab8f34ea 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py @@ -18,7 +18,7 @@ import pyarrow as pa from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown +from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, FixedTokenSizeChunker short_name = "doc_chunk" @@ -27,7 +27,10 @@ doc_id_column_name_key = "doc_id_column_name" chunking_type_key = "chunking_type" dl_min_chunk_len_key = "dl_min_chunk_len" +chunk_size_tokens_key = "chunk_size_tokens" +chunk_overlap_tokens_key = "chunk_overlap_tokens" output_chunk_column_name_key = "output_chunk_column_name" +output_chunk_column_id_key = "output_chunk_column_id" output_source_doc_id_column_name_key = "output_source_doc_id_column_name" output_jsonpath_column_name_key = "output_jsonpath_column_name" output_pageno_column_name_key = "output_pageno_column_name" @@ -41,11 +44,13 @@ output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}" output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}" output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}" - +chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}" +chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}" class chunking_types(str, enum.Enum): LI_MARKDOWN = "li_markdown" DL_JSON = "dl_json" + FIXED_SIZE = "fixed_size" def __str__(self): return str(self.value) @@ -56,11 +61,13 @@ def __str__(self): default_chunking_type = chunking_types.DL_JSON default_dl_min_chunk_len = None default_output_chunk_column_name = "contents" +default_output_chunk_column_id = "chunk_id" default_output_source_doc_id_column_name = "source_document_id" 
 default_output_jsonpath_column_name = "doc_jsonpath"
 default_output_pageno_column_name = "page_number"
 default_output_bbox_column_name = "bbox"
-
+default_chunk_size_tokens = 128
+default_chunk_overlap_tokens = 30

 class DocChunkTransform(AbstractTableTransform):
     """
@@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
         self.content_column_name = config.get(content_column_name_key, default_content_column_name)
         self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
         self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
+        self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
         self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

         # Parameters for Docling JSON chunking
@@ -96,6 +104,10 @@ def __init__(self, config: dict[str, Any]):
         )
         self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)

+        # Parameters for Fixed-size with overlap chunking
+        self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
+        self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)
+
         # Initialize chunker
         self.chunker: ChunkingExecutor

@@ -111,6 +123,13 @@ def __init__(self, config: dict[str, Any]):
             self.chunker = LIMarkdown(
                 output_chunk_column_name=self.output_chunk_column_name,
             )
+        elif self.chunking_type == chunking_types.FIXED_SIZE:
+            self.chunker = FixedTokenSizeChunker(
+                output_chunk_column_name=self.output_chunk_column_name,
+                output_chunk_column_id=self.output_chunk_column_id,
+                chunk_size_tokens=self.chunk_size_tokens,
+                chunk_overlap_tokens=self.chunk_overlap_tokens
+            )
         else:
             raise RuntimeError(f"{self.chunking_type=} is not valid.")

@@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=default_output_bbox_column_name,
             help="Column name to store the bbox of the chunk",
         )
+        parser.add_argument(
+            f"--{chunk_size_tokens_cli_param}",
+            default=default_chunk_size_tokens,
+            type=int,
+            help="Size of the chunk in tokens for the fixed-sized chunker",
+        )
+        parser.add_argument(
+            f"--{chunk_overlap_tokens_cli_param}",
+            default=default_chunk_overlap_tokens,
+            type=int,
+            help="Number of tokens overlapping between chunks for the fixed-sized chunker.",
+        )

     def apply_input_params(self, args: Namespace) -> bool:
         """
diff --git a/transforms/language/doc_chunk/python/test-data/expected_fixed_size/sample1.parquet b/transforms/language/doc_chunk/python/test-data/expected_fixed_size/sample1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..7c7065de02a582dfa29be4d8684ffef212ae8f7e
GIT binary patch
literal 8827
[8827 bytes of base85-encoded binary parquet data omitted]
diff --git a/transforms/language/doc_chunk/python/test-data/input_fixed_size/sample1.parquet b/transforms/language/doc_chunk/python/test-data/input_fixed_size/sample1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..bcfb98661d36f8f6c700f779911979b62d7f8345
GIT binary patch
literal 17749
[17749 bytes of base85-encoded binary parquet data omitted]

diff --git a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py b/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
index c593e1de5..fbc1cc609 100644
--- a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
+++ b/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
@@ -16,7 +16,11 @@
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from doc_chunk_transform import chunking_type_cli_param, chunking_types
+from doc_chunk_transform import (
+    chunking_type_cli_param,
+    output_chunk_column_name_cli_param,
+    chunking_types
+)
 from doc_chunk_transform_python import DocChunkPythonTransformConfiguration

@@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
                 basedir + "/expected_md",
             )
         )
+
+        # Run with fixed size token chunker
+        fixtures.append(
+            (
+                launcher,
+                {
+                    chunking_type_cli_param: chunking_types.FIXED_SIZE,
+                    output_chunk_column_name_cli_param: "chunk_text"
+                },
+                basedir + "/input_fixed_size",
+                basedir + "/expected_fixed_size",
+            )
+        )
         return fixtures

From c481c5cb87f43ee4d1381c6ce1ec30d86293952d Mon Sep 17 00:00:00 2001
From: Juan Cappi
Date: Thu, 3 Oct 2024 13:50:33 -0300
Subject: [PATCH 2/5] fix: change naming to better reflect that the new
 chunker also leverages a Llama Index chunker

Signed-off-by: Juan Cappi
#641
---
 .../doc_chunk/python/src/doc_chunk_chunkers.py     | 8 ++++++--
 .../doc_chunk/python/src/doc_chunk_local_python.py | 6 +++---
 .../doc_chunk/python/src/doc_chunk_transform.py    | 8 ++++----
 .../sample1.parquet                                | Bin
 .../sample1.parquet                                | Bin
 .../doc_chunk/python/test/test_doc_chunk_python.py | 6 +++---
 6 files changed, 16 insertions(+), 12 deletions(-)
rename transforms/language/doc_chunk/python/test-data/{expected_fixed_size => expected_token_text}/sample1.parquet (100%) rename transforms/language/doc_chunk/python/test-data/{input_fixed_size => input_token_text}/sample1.parquet (100%) diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py index 3b124aa11..a8ba44f61 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py @@ -69,9 +69,13 @@ def chunk(self, content: str) -> Iterator[dict]: } -class FixedTokenSizeChunker(ChunkingExecutor): +class LITokenTextSplitter(ChunkingExecutor): """ - Chunks input text into fixed-window lengths, measured in tokens, with an overlap also measured in tokens. + A text chunker that leverages Llama Index's token-based text splitter. This splitter breaks input text into + fixed-window chunks, with each chunk measured in tokens rather than characters. + + The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between + chunks (also measured in tokens) can be specified to preserve context between the chunks. Args: output_chunk_column_name (str): Name of the output column containing the text of each chunk. diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py index ecadef287..b3f4de43a 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py @@ -17,12 +17,12 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from doc_chunk_transform_python import DocChunkPythonTransformConfiguration - +from doc_chunk_transform import chunking_types # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md")) -# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_fixed_size")) +# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_token_text")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, @@ -40,7 +40,7 @@ # doc_chunk params # "doc_chunk_chunking_type": "li_markdown", "doc_chunk_chunking_type": "dl_json", - # "doc_chunk_chunking_type": "fixed_size", + "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, # fixed-size params # "doc_chunk_output_chunk_column_name": "chunk_text", # "doc_chunk_chunk_size_tokens": 128, diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py index 2ab8f34ea..7acdd3ef1 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py @@ -18,7 +18,7 @@ import pyarrow as pa from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, FixedTokenSizeChunker +from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, 
LITokenTextSplitter short_name = "doc_chunk" @@ -50,7 +50,7 @@ class chunking_types(str, enum.Enum): LI_MARKDOWN = "li_markdown" DL_JSON = "dl_json" - FIXED_SIZE = "fixed_size" + LI_TOKEN_TEXT = "li_token_text" def __str__(self): return str(self.value) @@ -123,8 +123,8 @@ def __init__(self, config: dict[str, Any]): self.chunker = LIMarkdown( output_chunk_column_name=self.output_chunk_column_name, ) - elif self.chunking_type == chunking_types.FIXED_SIZE: - self.chunker = FixedTokenSizeChunker( + elif self.chunking_type == chunking_types.LI_TOKEN_TEXT: + self.chunker = LITokenTextSplitter( output_chunk_column_name=self.output_chunk_column_name, output_chunk_column_id=self.output_chunk_column_id, chunk_size_tokens=self.chunk_size_tokens, diff --git a/transforms/language/doc_chunk/python/test-data/expected_fixed_size/sample1.parquet b/transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet similarity index 100% rename from transforms/language/doc_chunk/python/test-data/expected_fixed_size/sample1.parquet rename to transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet diff --git a/transforms/language/doc_chunk/python/test-data/input_fixed_size/sample1.parquet b/transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet similarity index 100% rename from transforms/language/doc_chunk/python/test-data/input_fixed_size/sample1.parquet rename to transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet diff --git a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py b/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py index fbc1cc609..5ecfa49a2 100644 --- a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py +++ b/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py @@ -65,11 +65,11 @@ def get_test_transform_fixtures(self) -> list[tuple]: ( launcher, { - chunking_type_cli_param: chunking_types.FIXED_SIZE, + chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT, output_chunk_column_name_cli_param: "chunk_text" }, - basedir + "/input_fixed_size", - basedir + "/expected_fixed_size", + basedir + "/input_token_text", + basedir + "/expected_token_text", ) ) return fixtures From 6d21ef394e41e075d793b25fe5d08c5c05bca8be Mon Sep 17 00:00:00 2001 From: Juan Cappi Date: Thu, 3 Oct 2024 15:25:04 -0300 Subject: [PATCH 3/5] fix: adjust documentation - #641 Signed-off-by: Juan Cappi --- transforms/language/doc_chunk/python/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index eea2d8105..f962717d6 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -29,12 +29,12 @@ The transform can be tuned with the following parameters. | Parameter | Default | Description | |------------|----------|--------------| -| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `fixed_size` for chunking text into fixed-sized windows of tokens, where both the window size and overlap between windows are measured in tokens. | +| `chunking_type` | `dl_json` | Chunking type to apply. 
Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. | | `content_column_name` | `contents` | Name of the column containing the text to be chunked. | | `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. | | `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. | -| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the fixed-sized chunker. | -| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the fixed-sized chunker. | +| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. | +| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. | | `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. | | `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. | | `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. | From 73e35d79ddeb1849b3616610d2a0dbb83c747b10 Mon Sep 17 00:00:00 2001 From: Juan Cappi Date: Fri, 4 Oct 2024 14:17:20 -0300 Subject: [PATCH 4/5] fix: add missing metadata.json as expected file in test fixture Signed-off-by: Juan Cappi --- .../expected_token_text/metadata.json | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json diff --git a/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json b/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json new file mode 100644 index 000000000..4d84b5915 --- /dev/null +++ b/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "doc_chunk", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-04 14:00:40", + "end_time": "2024-10-04 14:00:41", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "chunking_type": "li_token_text", + "content_column_name": "contents", + "doc_id_column_name": "document_id", + "dl_min_chunk_len": null, + "output_chunk_column_name": "chunk_text", + "output_source_doc_id_column_name": "source_document_id", + "output_jsonpath_column_name": "doc_jsonpath", + "output_pageno_column_name": "page_number", + "output_bbox_column_name": "bbox", + "chunk_size_tokens": 128, + "chunk_overlap_tokens": 30, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".parquet" + ], + "num_processors": 0 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 17749, + "result_files": 1, + "result_size": 8827, + "processing_time": 0.194, + "nfiles": 1, + "nrows": 10, + "source_doc_count": 2, + "result_doc_count": 10 + }, + 
"source": { + "name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_token_text", + "type": "path" + }, + "target": { + "name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/output", + "type": "path" + } +} \ No newline at end of file From 137fb2d18959e8a9cda8f4a45ced98b3821a52ab Mon Sep 17 00:00:00 2001 From: Juan Cappi Date: Fri, 4 Oct 2024 14:20:17 -0300 Subject: [PATCH 5/5] fix: comment extra config line Signed-off-by: Juan Cappi --- .../language/doc_chunk/python/src/doc_chunk_local_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py index b3f4de43a..e0fdfa871 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py @@ -40,7 +40,7 @@ # doc_chunk params # "doc_chunk_chunking_type": "li_markdown", "doc_chunk_chunking_type": "dl_json", - "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, + # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, # fixed-size params # "doc_chunk_output_chunk_column_name": "chunk_text", # "doc_chunk_chunk_size_tokens": 128,