Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add post chunking strategy #3869

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 45 additions & 4 deletions unstructured/chunking/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ def __call__(


def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
"""Decorator for chunking text.
"""Decorator for chunking text in both pre-processing and post-processing way.

Chunks the element sequence produced by the partitioner it decorates when a `chunking_strategy`
argument is present in the partitioner call and it names an available chunking strategy.
The chunking process could be both pre-process and post-process chunking

"""
# -- Patch the docstring of the decorated function to add chunking strategy and
# -- chunking-related argument documentation. This only applies when `chunking_strategy`
Expand All @@ -65,6 +67,25 @@ def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, lis
+ "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
+ "\n\t\t\tof length n characters, a hard max."
)
# -- Patch the docstring of the decorated function to add contexual chunking strategy and
# -- contextual_chunking-related argument documentation.
# -- This only applies when `contextual_chunking_strategy` is an explicit argument
# -- of the decorated function and "contextual_chunking_strategy" is not
# -- already mentioned in the docstring.
if func.__doc__ and (
"contextual_chunking_strategy" in func.__code__.co_varnames
and "contextual_chunking_strategy" not in func.__doc__
):
func.__doc__ += (
"\ncontextual_chunking_strategy"
+ "\n\tStrategy used to contextualize chunks into chunks with prefixs."
+ "\n\tDefaults to `None`"
+ "\n\tAdditional Parameters:"
+ "\n\t\\service_name"
+ "\n\t\t\tThe service name that describes the provider and its purpose"
+ "\n\t\tauth_env"
+ "\n\t\t\tthe authentication environment var name to get the auth token"
)

@functools.wraps(func)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
Expand All @@ -82,23 +103,43 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
return elements

# -- otherwise, chunk away :) --
# here, chunk() can be both pre-process and post-process chunking
return chunk(elements, chunking_strategy, **call_args)

return wrapper


def chunk(elements: Iterable[Element], chunking_strategy: str, **kwargs: Any) -> list[Element]:
"""Dispatch chunking of `elements` to the chunking function for `chunking_strategy`."""
"""Dispatch chunking of `elements` to the chunking function if only `chunking_strategy` present
if both `chunking_strategy` and `contextual_chunking_strategy` args are present and None,
use the chunketized results and perform contextual chunking function afterwards.

"""
chunker_spec = _chunker_registry.get(chunking_strategy)

if chunker_spec is None:
raise ValueError(f"unrecognized chunking strategy {repr(chunking_strategy)}")
# extract and remove contextual_chunking_strategy from kwargs if present
contextual_chunking_strategy = kwargs.pop("contextual_chunking_strategy", None)

# -- `kwargs` will in general be an omnibus dict of all keyword arguments to the partitioner;
# -- pick out and use only those supported by this chunker.
chunking_kwargs = {k: v for k, v in kwargs.items() if k in chunker_spec.kw_arg_names}

return chunker_spec.chunker(elements, **chunking_kwargs)
chunks = chunker_spec.chunker(elements, **chunking_kwargs)
if contextual_chunking_strategy == "v1":
contextual_chunking_spec = _chunker_registry.get(contextual_chunking_strategy)
if contextual_chunking_spec is None:
raise ValueError(
f"unrecognized contextual chunking strategy {repr(contextual_chunking_strategy)}"
)
# prepare kwargs for the contextual chunkin strategy such as service name, auth env etc
contextual_chunking_kwargs = {
k: v for k, v in kwargs.items() if k in contextual_chunking_spec.kw_arg_names
}
# perform post-chunking using contextual_chunking_strategy
chunks = contextual_chunking_spec.chunker(chunks, **contextual_chunking_kwargs)

return chunks


def register_chunking_strategy(name: str, chunker: Chunker) -> None:
Expand Down
Loading