From 70f81cbf4d7e8721d2d5623e70e29e7806a7749d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20G=C3=BCnther?=
Date: Mon, 23 Sep 2024 17:18:46 +0200
Subject: [PATCH] feat: add semantic chunking to eval script; add wrapper for
 minilm (#11)

* feat: add semantic chunking to eval script; add wrapper for minilm

* fix: gaps in semantic chunking

* feat: add option to pass custom model for chunking

* refactor: add second model to semantic chunking test
---
 chunked_pooling/chunking.py          |  8 +++----
 chunked_pooling/mteb_chunked_eval.py |  2 ++
 chunked_pooling/wrappers.py          | 11 +++++++--
 run_chunked_eval.py                  |  9 ++++++-
 tests/test_chunking_methods.py       | 36 ++++++++++++++++++++++++----
 5 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/chunked_pooling/chunking.py b/chunked_pooling/chunking.py
index beabb8c..facf1b0 100644
--- a/chunked_pooling/chunking.py
+++ b/chunked_pooling/chunking.py
@@ -31,6 +31,7 @@ def _setup_semantic_chunking(self, embedding_model_name):
         self.embed_model = HuggingFaceEmbedding(
             model_name=self.embedding_model_name,
             trust_remote_code=True,
+            embed_batch_size=1,
         )
         self.splitter = SemanticSplitterNodeParser(
             embed_model=self.embed_model,
@@ -71,13 +72,12 @@ def chunk_semantically(
             start_chunk_index = bisect.bisect_left(
                 [offset[0] for offset in token_offsets], char_start
             )
-            end_chunk_index = (
-                bisect.bisect_right([offset[1] for offset in token_offsets], char_end)
-                - 1
+            end_chunk_index = bisect.bisect_right(
+                [offset[1] for offset in token_offsets], char_end
             )

             # Add the chunk span if it's within the tokenized text
-            if start_chunk_index < len(token_offsets) and end_chunk_index < len(
+            if start_chunk_index < len(token_offsets) and end_chunk_index <= len(
                 token_offsets
             ):
                 chunk_spans.append((start_chunk_index, end_chunk_index))
diff --git a/chunked_pooling/mteb_chunked_eval.py b/chunked_pooling/mteb_chunked_eval.py
index 8da9a54..2433e7f 100644
--- a/chunked_pooling/mteb_chunked_eval.py
+++ b/chunked_pooling/mteb_chunked_eval.py
@@ -25,6 +25,7 @@ def __init__(
         chunk_size: Optional[int] = None,
         n_sentences: Optional[int] = None,
         model_has_instructions: bool = False,
+        embedding_model_name: Optional[str] = None,  # for semantic chunking
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -45,6 +46,7 @@ def __init__(
         self.chunking_args = {
             'chunk_size': chunk_size,
             'n_sentences': n_sentences,
+            'embedding_model_name': embedding_model_name,
         }

     def load_data(self, **kwargs):
diff --git a/chunked_pooling/wrappers.py b/chunked_pooling/wrappers.py
index 44984c5..a4bb0ef 100644
--- a/chunked_pooling/wrappers.py
+++ b/chunked_pooling/wrappers.py
@@ -2,6 +2,7 @@

 import torch
 import torch.nn as nn
+from sentence_transformers import SentenceTransformer
 from transformers import AutoModel


@@ -61,7 +62,10 @@ def has_instructions():
         return True


-MODEL_WRAPPERS = {'jinaai/jina-embeddings-v3': JinaEmbeddingsV3Wrapper}
+MODEL_WRAPPERS = {
+    'jinaai/jina-embeddings-v3': JinaEmbeddingsV3Wrapper,
+    'sentence-transformers/all-MiniLM-L6-v2': SentenceTransformer,
+}
 MODELS_WITHOUT_PROMPT_NAME_ARG = [
     'jinaai/jina-embeddings-v2-small-en',
     'jinaai/jina-embeddings-v2-base-en',
@@ -82,7 +86,10 @@ def wrapper(self, *args, **kwargs):
 def load_model(model_name, **model_kwargs):
     if model_name in MODEL_WRAPPERS:
         model = MODEL_WRAPPERS[model_name](model_name, **model_kwargs)
-        has_instructions = MODEL_WRAPPERS[model_name].has_instructions()
+        if hasattr(MODEL_WRAPPERS[model_name], 'has_instructions'):
+            has_instructions = MODEL_WRAPPERS[model_name].has_instructions()
+        else:
+            has_instructions = False
     else:
         model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
         has_instructions = False
diff --git a/run_chunked_eval.py b/run_chunked_eval.py
index 4f0057c..ff49da0 100644
--- a/run_chunked_eval.py
+++ b/run_chunked_eval.py
@@ -29,7 +29,13 @@
 @click.option(
     '--eval-split', default='test', help='The name of the evaluation split in the task.'
 )
-def main(model_name, strategy, task_name, eval_split):
+@click.option(
+    '--chunking-model',
+    default=None,
+    required=False,
+    help='The name of the model used for semantic chunking.',
+)
+def main(model_name, strategy, task_name, eval_split, chunking_model):
     try:
         task_cls = globals()[task_name]
     except:
@@ -44,6 +50,7 @@ def main(model_name, strategy, task_name, eval_split):
         'n_sentences': DEFAULT_N_SENTENCES,
         'chunking_strategy': strategy,
         'model_has_instructions': has_instructions,
+        'embedding_model_name': chunking_model if chunking_model else model_name,
     }

     if torch.cuda.is_available():
diff --git a/tests/test_chunking_methods.py b/tests/test_chunking_methods.py
index d99cc17..02c3e17 100644
--- a/tests/test_chunking_methods.py
+++ b/tests/test_chunking_methods.py
@@ -98,16 +98,42 @@ def test_chunk_by_tokens():
         assert end - start <= 10


-def test_chunk_semantically():
+@pytest.mark.parametrize(
+    'model_name',
+    ['jinaai/jina-embeddings-v2-small-en', 'sentence-transformers/all-MiniLM-L6-v2'],
+)
+def test_chunk_semantically(model_name):
     chunker = Chunker(chunking_strategy="semantic")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    chunks = chunker.chunk(
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokens = tokenizer.encode_plus(
+        EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True
+    )
+    boundary_cues = chunker.chunk(
         EXAMPLE_TEXT_1,
         tokenizer=tokenizer,
         chunking_strategy='semantic',
-        embedding_model_name='jinaai/jina-embeddings-v2-small-en',
+        embedding_model_name=model_name,
+    )
+
+    # check if it returns boundary cues
+    assert len(boundary_cues) > 0
+
+    # test if boundaries are at the end of sentences
+    for start_token_idx, end_token_idx in boundary_cues:
+        assert (
+            EXAMPLE_TEXT_1[tokens.offset_mapping[end_token_idx - 1][0]] in PUNCTATIONS
+        )
+        decoded_text_chunk = tokenizer.decode(
+            tokens.input_ids[start_token_idx:end_token_idx]
+        )
+
+    # check that the boundary cues are continuous (no token is missing)
+    assert all(
+        [
+            boundary_cues[i][1] == boundary_cues[i + 1][0]
+            for i in range(len(boundary_cues) - 1)
+        ]
     )
-    assert len(chunks) > 0


 def test_empty_input():
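
Usage note (reviewer sketch, not part of the patch): the snippet below shows how the new semantic-chunking path can be exercised end to end, mirroring the parametrized test above. The import path for Chunker and the sample text are assumptions on my part; the MiniLM model name comes from the new MODEL_WRAPPERS entry.

    # Sketch only -- mirrors tests/test_chunking_methods.py after this patch.
    # Assumed: Chunker is importable from chunked_pooling.chunking; the
    # sample text is illustrative.
    from transformers import AutoTokenizer

    from chunked_pooling.chunking import Chunker

    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    chunker = Chunker(chunking_strategy='semantic')

    text = 'First point. Second point. A third, longer point.'
    # Returns (start_token_idx, end_token_idx) spans over the tokenized text.
    # With the bisect_right fix above, end indices are exclusive, so each
    # span's end equals the next span's start and no tokens fall into gaps.
    spans = chunker.chunk(
        text,
        tokenizer=tokenizer,
        chunking_strategy='semantic',
        embedding_model_name=model_name,
    )
    print(spans)  # e.g. [(0, 5), (5, 11)] -- exact spans depend on the model

On the CLI, the same path is reached through run_chunked_eval.py's new --chunking-model option; when it is omitted, the evaluated model itself is used for chunking (the 'chunking_model if chunking_model else model_name' fallback above).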