Skip to content

Commit

Permalink
chore: solve merge conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
guenthermi committed Sep 24, 2024
2 parents 4ca4204 + 70f81cb commit 7ee85da
Showing 1 changed file with 27 additions and 3 deletions.
30 changes: 27 additions & 3 deletions tests/test_chunking_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,41 @@ def test_chunk_by_tokens():
assert end - start <= 10


@pytest.mark.parametrize(
    'model_name',
    ['jinaai/jina-embeddings-v2-small-en', 'sentence-transformers/all-MiniLM-L6-v2'],
)
def test_chunk_semantically(model_name):
    """Semantic chunking returns sentence-aligned, gapless token-span boundaries.

    For each embedding model, verifies that ``Chunker.chunk`` with the
    ``'semantic'`` strategy:
      * returns at least one ``(start_token_idx, end_token_idx)`` boundary cue,
      * ends every chunk on a sentence-final punctuation character, and
      * produces contiguous spans (each chunk starts where the previous ended).
    """
    chunker = Chunker(chunking_strategy="semantic")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # offset_mapping lets us map a token index back to a character position
    # in EXAMPLE_TEXT_1 so we can inspect the character a chunk ends on.
    tokens = tokenizer.encode_plus(
        EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True
    )
    boundary_cues = chunker.chunk(
        EXAMPLE_TEXT_1,
        tokenizer=tokenizer,
        chunking_strategy='semantic',
        embedding_model_name=model_name,
    )

    # check if it returns boundary cues
    assert len(boundary_cues) > 0

    # test if boundaries are at the end of sentences
    for start_token_idx, end_token_idx in boundary_cues:
        # end_token_idx is exclusive, so the last token of the chunk is at
        # end_token_idx - 1; its offset start indexes the punctuation char.
        assert (
            EXAMPLE_TEXT_1[tokens.offset_mapping[end_token_idx - 1][0]] in PUNCTATIONS
        )

    # check that the boundary cues are continuous (no token is missing)
    assert all(
        boundary_cues[i][1] == boundary_cues[i + 1][0]
        for i in range(len(boundary_cues) - 1)
    )

# check if it returns boundary cues
Expand Down

0 comments on commit 7ee85da

Please sign in to comment.