Skip to content

Commit

Permalink
Label Filter Cases (only compatible with Milvus)
Browse files Browse the repository at this point in the history
Signed-off-by: min.tian <[email protected]>
  • Loading branch information
alwayslove2013 committed Jan 22, 2025
1 parent d30f816 commit eaecc39
Show file tree
Hide file tree
Showing 16 changed files with 483 additions and 104 deletions.
28 changes: 1 addition & 27 deletions vectordb_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,33 +27,7 @@ class config:
DROP_OLD = env.bool("DROP_OLD", True)
USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)

NUM_CONCURRENCY = env.list(
"NUM_CONCURRENCY",
[
1,
5,
10,
15,
20,
25,
30,
35,
40,
45,
50,
55,
60,
65,
70,
75,
80,
85,
90,
95,
100,
],
subcast=int,
)
NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 20, 30, 40, 60, 80], subcast=int)

CONCURRENCY_DURATION = 30

Expand Down
91 changes: 72 additions & 19 deletions vectordb_bench/backend/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@

from vectordb_bench import config
from vectordb_bench.backend.clients.api import MetricType
from vectordb_bench.backend.filter import Filter, FilterType, IntFilter, LabelFilter, non_filter
from vectordb_bench.base import BaseModel
from vectordb_bench.frontend.components.custom.getCustomConfig import (
CustomDatasetConfig,
)
from vectordb_bench.frontend.components.custom.getCustomConfig import CustomDatasetConfig

from .dataset import CustomDataset, Dataset, DatasetManager, DatasetWithSizeType

Expand Down Expand Up @@ -50,6 +49,8 @@ class CaseType(Enum):

StreamingPerformanceCase = 200

LabelFilterPerformanceCase = 300

def case_cls(self, custom_configs: dict | None = None) -> type["Case"]:
if custom_configs is None:
return type2case.get(self)()
Expand Down Expand Up @@ -97,15 +98,21 @@ class Case(BaseModel):
filter_rate: float | None = None

@property
def filters(self) -> dict | None:
if self.filter_rate is not None:
target_id = round(self.filter_rate * self.dataset.data.size)
return {
"metadata": f">={target_id}",
"id": target_id,
}
def filters(self) -> Filter:
return non_filter

@property
def with_scalar_labels(self) -> bool:
return self.filters.type == FilterType.Label

def check_scalar_labels(self) -> None:
if self.with_scalar_labels and not self.dataset.data.with_scalar_labels:
msg = f"Case init failed: no scalar_labels data in current dataset ({self.dataset.data.full_name})"
raise ValueError(msg)

return None
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.check_scalar_labels()


class CapacityCase(Case):
Expand Down Expand Up @@ -151,6 +158,14 @@ class Performance768D10M(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M


class IntFilterPerformanceCase(PerformanceCase):
    """Performance case whose search filter is an integer comparison.

    The filter threshold is derived from ``filter_rate``: a rate of 0.01 on a
    10M-row dataset yields an int threshold of 100_000, so the filter passes
    roughly (1 - filter_rate) of the rows.
    """

    @property
    def filters(self) -> Filter:
        """Build an int-comparison filter scaled to the current dataset size."""
        # Truncate (not round) the product, matching int() semantics.
        threshold = int(self.dataset.data.size * self.filter_rate)
        return IntFilter(
            filter_rate=self.filter_rate,
            int_field=self.dataset.data.train_id_field,
            int_value=threshold,
        )


class Performance768D1M(PerformanceCase):
case_id: CaseType = CaseType.Performance768D1M
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
Expand All @@ -162,7 +177,7 @@ class Performance768D1M(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_1M


class Performance768D10M1P(PerformanceCase):
class Performance768D10M1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D10M1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
Expand All @@ -174,7 +189,7 @@ class Performance768D10M1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M


class Performance768D1M1P(PerformanceCase):
class Performance768D1M1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D1M1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
Expand All @@ -186,7 +201,7 @@ class Performance768D1M1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_1M


class Performance768D10M99P(PerformanceCase):
class Performance768D10M99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D10M99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
Expand All @@ -198,7 +213,7 @@ class Performance768D10M99P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M


class Performance768D1M99P(PerformanceCase):
class Performance768D1M99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D1M99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
Expand Down Expand Up @@ -246,7 +261,7 @@ class Performance1536D5M(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M


class Performance1536D500K1P(PerformanceCase):
class Performance1536D500K1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D500K1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.OPENAI.manager(500_000)
Expand All @@ -258,7 +273,7 @@ class Performance1536D500K1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_500K


class Performance1536D5M1P(PerformanceCase):
class Performance1536D5M1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D5M1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.OPENAI.manager(5_000_000)
Expand All @@ -270,7 +285,7 @@ class Performance1536D5M1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M


class Performance1536D500K99P(PerformanceCase):
class Performance1536D500K99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D500K99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.OPENAI.manager(500_000)
Expand All @@ -282,7 +297,7 @@ class Performance1536D500K99P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_500K


class Performance1536D5M99P(PerformanceCase):
class Performance1536D5M99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D5M99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.OPENAI.manager(5_000_000)
Expand Down Expand Up @@ -408,6 +423,43 @@ def __init__(
)


class LabelFilterPerformanceCase(PerformanceCase):
    """Performance case that searches with a scalar-label filter.

    Parameterized by a dataset (``dataset_with_size_type``) and a target
    ``label_percentage`` — the fraction of rows carrying the filtered label.
    Requires a dataset that ships scalar-label data (enforced by the base
    class's ``check_scalar_labels`` via the ``filters`` property type).
    """

    case_id: CaseType = CaseType.LabelFilterPerformanceCase
    # Which dataset/size combination this case runs against.
    dataset_with_size_type: DatasetWithSizeType
    # Fraction (0.0-1.0) of rows expected to match the label filter.
    label_percentage: float

    def __init__(
        self,
        dataset_with_size_type: DatasetWithSizeType | str,
        label_percentage: float,
        **kwargs,
    ):
        """Derive name, description, dataset manager, timeouts, and filter rate,
        then delegate field assignment to the pydantic base-class constructor.
        """
        # Accept either the enum itself or its string value (e.g. from CLI/config).
        if not isinstance(dataset_with_size_type, DatasetWithSizeType):
            dataset_with_size_type = DatasetWithSizeType(dataset_with_size_type)
        name = f"Label-Filter-{label_percentage*100:.1f}% - {dataset_with_size_type.value}"
        description = f"Label-Filter-{label_percentage*100:.1f}% Performance Test ({dataset_with_size_type.value})"
        dataset = dataset_with_size_type.get_manager()
        load_timeout = dataset_with_size_type.get_load_timeout()
        optimize_timeout = dataset_with_size_type.get_optimize_timeout()
        # Build a throwaway filter once just to extract its effective filter_rate
        # for the base-class field; the live filter is recreated by the property.
        filters = LabelFilter(label_percentage=label_percentage)
        filter_rate = filters.filter_rate
        super().__init__(
            name=name,
            description=description,
            dataset=dataset,
            load_timeout=load_timeout,
            optimize_timeout=optimize_timeout,
            filter_rate=filter_rate,
            dataset_with_size_type=dataset_with_size_type,
            label_percentage=label_percentage,
            **kwargs,
        )

    @property
    def filters(self) -> Filter:
        """Return a label filter targeting ``label_percentage`` of the rows."""
        return LabelFilter(label_percentage=self.label_percentage)


type2case = {
CaseType.CapacityDim960: CapacityDim960,
CaseType.CapacityDim128: CapacityDim128,
Expand All @@ -427,4 +479,5 @@ def __init__(
CaseType.Performance1536D50K: Performance1536D50K,
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
}
21 changes: 19 additions & 2 deletions vectordb_bench/backend/clients/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from pydantic import BaseModel, SecretStr, validator

from vectordb_bench.backend.filter import Filter, FilterType


class MetricType(str, Enum):
L2 = "L2"
Expand Down Expand Up @@ -110,6 +112,21 @@ class VectorDB(ABC):
>>> milvus.search_embedding()
"""

"The filtering types supported by the VectorDB Client, default only non-filter"
supported_filter_types: list[FilterType] = [FilterType.NonFilter]

@classmethod
def filter_supported(cls, filters: Filter) -> bool:
"""Ensure that the filters are supported before testing filtering cases."""
return filters.type in cls.supported_filter_types

    def prepare_filter(self, filters: Filter):
        """Optional hook: pre-build filter expressions before a test case runs.

        The vector database is allowed to pre-prepare different filter conditions
        to reduce redundancy during the testing process.
        (All search tests in a case use consistent filtering conditions.)

        Default implementation is a no-op; clients override it as needed.
        """
        return

@abstractmethod
def __init__(
self,
Expand Down Expand Up @@ -160,8 +177,9 @@ def insert_embeddings(
self,
embeddings: list[list[float]],
metadata: list[int],
labels_data: list[str] | None = None,
**kwargs,
) -> (int, Exception):
) -> tuple[int, Exception]:
"""Insert the embeddings to the vector database. The default number of embeddings for
each insert_embeddings is 5000.
Expand All @@ -180,7 +198,6 @@ def search_embedding(
self,
query: list[float],
k: int = 100,
filters: dict | None = None,
) -> list[int]:
"""Get k most similar embeddings to query vector.
Expand Down
3 changes: 2 additions & 1 deletion vectordb_bench/backend/clients/milvus/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


class MilvusConfig(DBConfig):
uri: SecretStr = "http://10.102.7.230:19530"
uri: SecretStr = "http://localhost:19530"
user: str | None = None
password: SecretStr | None = None

Expand Down Expand Up @@ -33,6 +33,7 @@ class MilvusIndexConfig(BaseModel):

index: IndexType
metric_type: MetricType | None = None
use_partition_key: bool = True # for label-filter

@property
def is_gpu_index(self) -> bool:
Expand Down
Loading

0 comments on commit eaecc39

Please sign in to comment.