Skip to content

Commit

Permalink
chore: solve merge conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
guenthermi committed Sep 24, 2024
2 parents 4ca4204 + 70f81cb commit 7ee85da
Showing 1 changed file with 27 additions and 3 deletions.
30 changes: 27 additions & 3 deletions tests/test_chunking_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,41 @@ def test_chunk_by_tokens():
assert end - start <= 10


@pytest.mark.parametrize(
    'model_name',
    ['jinaai/jina-embeddings-v2-small-en', 'sentence-transformers/all-MiniLM-L6-v2'],
)
def test_chunk_semantically(model_name):
    """Semantic chunking returns sentence-aligned, gapless token-span boundaries.

    For each embedding model, verifies that ``Chunker.chunk`` with the
    ``'semantic'`` strategy:
      * returns at least one ``(start_token_idx, end_token_idx)`` boundary cue,
      * ends every chunk on a sentence-final punctuation character, and
      * produces contiguous spans (each chunk starts where the previous ended).
    """
    chunker = Chunker(chunking_strategy="semantic")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # offset_mapping lets us map a token index back to a character position
    # in EXAMPLE_TEXT_1 so we can inspect the character a chunk ends on.
    tokens = tokenizer.encode_plus(
        EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True
    )
    boundary_cues = chunker.chunk(
        EXAMPLE_TEXT_1,
        tokenizer=tokenizer,
        chunking_strategy='semantic',
        embedding_model_name=model_name,
    )

    # check if it returns boundary cues
    assert len(boundary_cues) > 0

    # test if boundaries are at the end of sentences
    for start_token_idx, end_token_idx in boundary_cues:
        # end_token_idx is exclusive, so the last token of the chunk is at
        # end_token_idx - 1; its offset start indexes the punctuation char.
        assert (
            EXAMPLE_TEXT_1[tokens.offset_mapping[end_token_idx - 1][0]] in PUNCTATIONS
        )

    # check that the boundary cues are continuous (no token is missing)
    assert all(
        boundary_cues[i][1] == boundary_cues[i + 1][0]
        for i in range(len(boundary_cues) - 1)
    )

# check if it returns boundary cues
Expand Down

0 comments on commit 7ee85da

Please sign in to comment.