test: add text match negative test case (#38982)
pr: #38981

Signed-off-by: zhuwenxing <[email protected]>
zhuwenxing authored Jan 11, 2025
1 parent b0afe32 commit 95df38e
Showing 1 changed file with 111 additions and 0 deletions.
111 changes: 111 additions & 0 deletions tests/python_client/testcases/test_query.py
@@ -6620,6 +6620,7 @@ def test_query_text_match_with_nullable(self):


class TestQueryTextMatchNegative(TestcaseBase):

@pytest.mark.tags(CaseLabel.L0)
def test_query_text_match_with_unsupported_tokenizer(self):
"""
@@ -6687,6 +6688,116 @@ def test_query_text_match_with_unsupported_tokenizer(self):
check_items=error,
)

@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_query_text_match_when_enable_match_false(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
        target: test text match when the field's enable_match is false
        method: 1. create varchar fields with enable_match disabled and insert data
                2. get the most common words and query with text match
                3. verify the result
        expected: the text match query fails and returns an error
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=False,
is_partition_key=enable_partition_key,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=False,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=False,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=False,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields=fields, description="test collection")
data_size = 3000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
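        # pick a language-appropriate fake-data generator for the tokenizer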
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"

data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
        # the inverted index can only be applied after the collection is flushed;
        # it may not be applied to growing segments, even under strong consistency.
collection_w.flush()
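        # a vector index is required before the collection can be loaded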
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
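        # optionally add a scalar INVERTED index on the word field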
if enable_inverted_index:
collection_w.create_index("word", {"index_type": "INVERTED"})
collection_w.load()
        # analyze the corpus to get per-field word frequencies
text_fields = ["word", "sentence", "paragraph", "text"]
wf_map = {}
for field in text_fields:
wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
# query single field for one token
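        # each query is expected to fail with err_code 65535 since enable_match is disabled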
for field in text_fields:
token = wf_map[field].most_common()[0][0]
expr = f"text_match({field}, '{token}')"
log.info(f"expr: {expr}")
res, _ = collection_w.query(expr=expr, output_fields=["id", field],
check_task=CheckTasks.err_res,
check_items={ct.err_code: 65535, ct.err_msg: "query failed"})
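
    # For contrast, a hypothetical positive counterpart (a sketch, not part of
    # this commit): with enable_match=True, the same text_match expression is
    # expected to succeed instead of raising an error.
    def _sketch_text_match_enabled(self):
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="word", dtype=DataType.VARCHAR, max_length=65535,
                        enable_analyzer=True,
                        enable_match=True,  # the one flag that differs
                        analyzer_params={"tokenizer": "standard"}),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=128),
        ]
        schema = CollectionSchema(fields=fields, description="sketch")
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        collection_w.insert([{"id": 0, "word": "token",
                              "emb": [random.random() for _ in range(128)]}])
        collection_w.flush()
        collection_w.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        collection_w.load()
        # expected to return the matching row rather than an error
        res, _ = collection_w.query(expr="text_match(word, 'token')",
                                    output_fields=["id", "word"])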

class TestQueryFunction(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
