From dfa6f84db56111d5c39cbbd031cd924fe36f8781 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sat, 10 Aug 2024 17:47:16 -0400 Subject: [PATCH 01/49] Add SMILES, AI Paraphrase and Inter-Source Paragraphs PairClassification Tasks --- mteb/tasks/PairClassification/__init__.py | 7 ++ .../eng/PubChemAIParagraphsParaphrasePC.py | 62 +++++++++++++++ .../eng/PubChemAISentenceParaphrasePC.py | 62 +++++++++++++++ .../eng/PubChemSMILESCanonDescPC.py | 75 ++++++++++++++++++ .../eng/PubChemSMILESCanonTitlePC.py | 78 +++++++++++++++++++ .../eng/PubChemSMILESIsoDescPC.py | 75 ++++++++++++++++++ .../eng/PubChemSMILESIsoTitlePC.py | 78 +++++++++++++++++++ .../eng/PubChemWikiParagraphsPC.py | 62 +++++++++++++++ 8 files changed, 499 insertions(+) create mode 100644 mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index c2057a4952..5bfba98f2e 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -7,6 +7,13 @@ from .eng.SprintDuplicateQuestionsPC import * from .eng.TwitterSemEval2015PC import * from .eng.TwitterURLCorpusPC import * +from .eng.PubChemWikiParagraphsPC import * +from .eng.PubChemAISentenceParaphrasePC import * +from .eng.PubChemAIParagraphsParaphrasePC import * +from .eng.PubChemSMILESCanonDescPC import * +from .eng.PubChemSMILESCanonTitlePC import * +from .eng.PubChemSMILESIsoDescPC import * +from .eng.PubChemSMILESIsoTitlePC import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py new file mode 100644 index 0000000000..fe491be68d --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py @@ -0,0 +1,62 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemAIParagraphsParaphrasePC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemAIParagraphsParaphrasePC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemAIParagraphsParaphrasePC", + "revision": "bc3efec1bde242c3cdc3b7870c094f4e5a935fee" + }, + type="PairClassification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + self.dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"] + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py new file mode 100644 index 0000000000..cbb763bb01 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -0,0 +1,62 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemAISentenceParaphrasePC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemAISentenceParaphrasePC", + "revision": "eeaad4bb9ec83058589faec127cdcb38fc7bfb2e" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + self.dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"] + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py new file mode 100644 index 0000000000..417aca9fef --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py @@ -0,0 +1,75 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +_DATASET_COLUMN_MAP = { + "sentence1": "description", + "sentence2": "canonical_smiles", + "labels": "labels", +} + + +class PubChemSMILESCanonDescPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSMILESCanonDescPC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonDescPC", + "revision": "ed91e93abf734d82ad1a0abf4d7653521173e2fb" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + + self.dataset = _dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], + "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], + "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]] + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py new file mode 100644 index 0000000000..cb227dc9d7 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py @@ -0,0 +1,78 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +_DATASET_COLUMN_MAP = { + "sentence1": "title", + "sentence2": "canonical_smiles", + "labels": "labels", +} + + +class PubChemSMILESCanonTitlePC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSMILESCanonTitlePC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonTitlePC", + "revision": "e0df8b7d5a9184cd8fc981959e6a81228f28c3a1" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={ + "n_samples": {"train": 43052}, + "avg_character_length": {"train": 34} + } + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + + self.dataset = _dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], + "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], + "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]] + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py new file mode 100644 index 0000000000..c93954fd4c --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py @@ -0,0 +1,75 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +_DATASET_COLUMN_MAP = { + "sentence1": "description", + "sentence2": "isomeric_smiles", + "labels": "labels", +} + + +class PubChemSMILESIsoDescPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSMILESIsoDescPC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoDescPC", + "revision": "a6c8bb2cb2ced89bf4576744c85b66b3319aa330" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + + self.dataset = _dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], + "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], + "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py new file mode 100644 index 0000000000..cde72117d1 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py @@ -0,0 +1,78 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +_DATASET_COLUMN_MAP = { + "sentence1": "title", + "sentence2": "isomeric_smiles", + "labels": "labels", +} + + +class PubChemSMILESIsoTitlePC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSMILESIsoTitlePC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoTitlePC", + "revision": "d8865a1b3a269aa2f4b4059a13c1037a9adfbc5d" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={ + "n_samples": {"train": 43052}, + "avg_character_length": {"train": 34} + } + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + + self.dataset = _dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], + "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], + "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py new file mode 100644 index 0000000000..062ac3ed9c --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -0,0 +1,62 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemWikiParagraphsPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemWikiParagraphsPC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemWikiParagraphsPC", + "revision": "3a12c34da3bdfeaca47058b624e8223513c7ae46" + }, + type="PairClassification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + self.dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"] + } + ] + self.dataset = _dataset From b56e01712161f7f4aabe0cbb8d542af6df6f526f Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 12 Aug 2024 08:30:10 -0400 Subject: [PATCH 02/49] Add chemical subsets of NQ and HotpotQA datasets as Retrieval tasks --- .../Retrieval/eng/ChemHotpotQARetrieval.py | 57 +++++++++++++++++++ mteb/tasks/Retrieval/eng/ChemNQRetrieval.py | 42 ++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/ChemNQRetrieval.py diff --git a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py new file mode 100644 index 0000000000..1df15eb5c4 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class ChemHotpotQARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ChemHotpotQARetrieval", + dataset={ + "path": "BASF-We-Create-Chemistry/ChemHotpotQARetrieval", + "revision": "f39c1d16edd269f233be381216bc8146f0857124", + }, + description=( + "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong" + " supervision for supporting facts to enable more explainable question answering systems." + ), + reference="https://hotpotqa.github.io/", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train", "dev", "test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa, + title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", + author = "Yang, Zhilin and + Qi, Peng and + Zhang, Saizheng and + Bengio, Yoshua and + Cohen, William and + Salakhutdinov, Ruslan and + Manning, Christopher D.", + editor = "Riloff, Ellen and + Chiang, David and + Hockenmaier, Julia and + Tsujii, Jun{'}ichi", + booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", + month = oct # "-" # nov, + year = "2018", + address = "Brussels, Belgium", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/D18-1259", + doi = "10.18653/v1/D18-1259", + pages = "2369--2380", + abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", +}""", + descriptive_stats={} + ) diff --git a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py new file mode 100644 index 0000000000..70ae066e72 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py @@ -0,0 +1,42 @@ +from __future__ import annotations +import os +import logging + +from mteb.abstasks.TaskMetadata import TaskMetadata + + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + +logger = logging.getLogger(__name__) + + +class ChemNQRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ChemNQRetrieval", + dataset={ + "path": "BASF-We-Create-Chemistry/ChemNQRetrieval", + "revision": "023e7a813e3b73d8d33551ed2aea511314d612e2", + }, + description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", + reference="https://ai.google.com/research/NaturalQuestions/", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh + and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee + and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le + and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational + Linguistics}}""", + descriptive_stats={} + ) From 678dbc9dc57a47dc99329896f41e626c2af5d914 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 12 Aug 2024 08:43:12 -0400 Subject: [PATCH 03/49] Add PubChem Synonyms PairClassification task --- .../eng/PubChemSynonymPC.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 mteb/tasks/PairClassification/eng/PubChemSynonymPC.py diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py new file mode 100644 index 0000000000..3ce1fa2cf2 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -0,0 +1,71 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +_DATASET_COLUMN_MAP = { + "sentence1": "title", + "sentence2": "synonyms", + "labels": "labels", +} + + +class PubChemSynonymPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSynonymPC", + description="""TBW""", + reference="https://pubchem.ncbi.nlm.nih.gov/", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSynonymPC", + "revision": "dcd82d56e7a8388db23aa962c99d59fe9a2ba7e3" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + self.dataset = _dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + _dataset = {} + + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], + "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], + "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]], + } + ] + self.dataset = _dataset From 9c8f7f5f910be4e0fcdd539ef16ce0de2275de52 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 12 Aug 2024 08:49:36 -0400 Subject: [PATCH 04/49] Update task __init__ for previously added tasks --- mteb/tasks/PairClassification/__init__.py | 1 + mteb/tasks/Retrieval/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 5bfba98f2e..e9d45fa9f4 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -14,6 +14,7 @@ from .eng.PubChemSMILESCanonTitlePC import * from .eng.PubChemSMILESIsoDescPC import * from .eng.PubChemSMILESIsoTitlePC import * +from .eng.PubChemSynonymPC import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index aee7749d53..28d7ab6a03 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -27,6 +27,8 @@ from .eng.ARCChallengeRetrieval import * from .eng.ArguAnaRetrieval import * from .eng.BrightRetrieval import * +from .eng.ChemNQRetrieval import * +from .eng.ChemHotpotQARetrieval import * from .eng.ClimateFEVERRetrieval import * from .eng.CQADupstackAndroidRetrieval import * from .eng.CQADupstackEnglishRetrieval import * From 5e312080408cc9986022ae9648f2e3b323decb6e Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 12 Aug 2024 08:49:50 -0400 Subject: [PATCH 05/49] Add nomic-bert loader --- mteb/models/__init__.py | 2 + mteb/models/nomic_bert_models.py | 149 +++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 mteb/models/nomic_bert_models.py diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index c68e4f5a5a..668ed9f7ba 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -19,6 +19,7 @@ llm2vec_models, mxbai_models, nomic_models, + nomic_bert_models, openai_models, ru_sentence_models, salesforce_models, @@ -132,6 +133,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe llm2vec_models, mxbai_models, nomic_models, + nomic_bert_models, openai_models, ru_sentence_models, salesforce_models, diff --git a/mteb/models/nomic_bert_models.py b/mteb/models/nomic_bert_models.py new file mode 100644 index 0000000000..281a84f9a5 --- /dev/null +++ b/mteb/models/nomic_bert_models.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +from functools import partial + +from sentence_transformers import SentenceTransformer +from sentence_transformers.models import Transformer, Pooling + +from mteb.model_meta import ModelMeta + +from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM +from typing import Any, Dict, Optional + +import torch.nn as nn +import torch + + +class NomicBertTransformer(Transformer): + def __init__( + self, + model_name_or_path: str, + max_seq_length: Optional[int] = None, + model_args: Optional[Dict[str, Any]] = None, + tokenizer_args: Optional[Dict[str, Any]] = None, + config_args: Optional[Dict[str, Any]] = None, + cache_dir: Optional[str] = None, + do_lower_case: bool = False, + tokenizer_name_or_path: str = None, + revision: str = None + ) -> None: + nn.Module.__init__(self) + self.config_keys = ["max_seq_length", "do_lower_case"] + self.do_lower_case = do_lower_case + if model_args is None: + model_args = {} + if tokenizer_args is None: + tokenizer_args = {} + if config_args is None: + config_args = {} + + config = AutoConfig.from_pretrained( + model_name_or_path, **config_args, cache_dir=cache_dir) + self.auto_model = AutoModelForMaskedLM.from_pretrained( + model_name_or_path, config=config, revision=revision, cache_dir=cache_dir, **model_args + ) + self.auto_model.cls = nn.Identity() + if max_seq_length is not None and "model_max_length" not in tokenizer_args: + tokenizer_args["model_max_length"] = max_seq_length + + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, + cache_dir=cache_dir, + **tokenizer_args, + ) + # No max_seq_length set. Try to infer from model + if max_seq_length is None: + if ( + hasattr(self.auto_model, "config") + and hasattr(self.auto_model.config, "max_position_embeddings") + and hasattr(self.tokenizer, "model_max_length") + ): + max_seq_length = min( + self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length) + + self.max_seq_length = max_seq_length + if tokenizer_name_or_path is not None: + self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ + + def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """Returns token_embeddings, cls_token""" + trans_features = { + "input_ids": features["input_ids"], "attention_mask": features["attention_mask"]} + if "token_type_ids" in features: + trans_features["token_type_ids"] = features["token_type_ids"] + + output_states = self.auto_model(**trans_features) + output_tokens = output_states.logits + + features.update({"token_embeddings": output_tokens, + "attention_mask": features["attention_mask"]}) + + if self.auto_model.config.output_hidden_states: + all_layer_idx = 2 + if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states + all_layer_idx = 1 + + hidden_states = output_states[all_layer_idx] + features.update({"all_layer_embeddings": hidden_states}) + + return features + + +class SentenceTransformerWithNormalization(SentenceTransformer): + def encode(self, sentences, *args, **kwargs): + if "normalize_embeddings" not in kwargs: + kwargs["normalize_embeddings"] = True + + return super().encode(sentences, *args, **kwargs) + + +def nomic_bert_loader( + model_name: str, revision: str | None, **kwargs +) -> SentenceTransformer: + nomic_bert_transformer = NomicBertTransformer( + model_name_or_path=model_name, + tokenizer_name_or_path='bert-base-uncased', + config_args={'trust_remote_code': True}, + model_args={'trust_remote_code': True}, + revision=revision + ) + + pooling_model = Pooling( + nomic_bert_transformer.get_word_embedding_dimension()) + + return SentenceTransformerWithNormalization(modules=[nomic_bert_transformer, pooling_model]) + + +def custom_nomic_bert_loader( + model_name: str, + tokenizer_name: str, + revision: str | None, **kwargs +) -> SentenceTransformer: + nomic_bert_transformer = NomicBertTransformer( + model_name_or_path=model_name, + tokenizer_name_or_path=tokenizer_name, + config_args={'trust_remote_code': True}, + model_args={'trust_remote_code': True, 'use_auth_token': True}, + tokenizer_args={'use_auth_token': True}, + revision=revision + ) + + pooling_model = Pooling( + nomic_bert_transformer.get_word_embedding_dimension()) + + return SentenceTransformerWithNormalization(modules=[nomic_bert_transformer, pooling_model]) + + +nomic_bert = ModelMeta( + loader=partial( # type: ignore + nomic_bert_loader, + model_name="nomic-ai/nomic-bert-2048", + revision=None, + ), + name="nomic-ai/nomic-bert-2048", + languages=["eng-Latn"], + open_source=True, + revision=None, + release_date="2024-01-03", # first commit +) + From 9806073e3f25e31c600c1dab642dfa08d77e7e68 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 12 Aug 2024 08:59:07 -0400 Subject: [PATCH 06/49] Add a script to run the evaluation pipeline for chemical-related tasks --- chem_eval.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 chem_eval.py diff --git a/chem_eval.py b/chem_eval.py new file mode 100644 index 0000000000..b8cbbff3b9 --- /dev/null +++ b/chem_eval.py @@ -0,0 +1,29 @@ +import mteb +from tqdm import tqdm + +models = ["allenai/scibert_scivocab_uncased", + "google-bert/bert-base-uncased", + "intfloat/multilingual-e5-small", + "intfloat/multilingual-e5-base", + "intfloat/multilingual-e5-large", + "nomic-ai/nomic-embed-text-v1.5", + "nomic-ai/nomic-embed-text-v1", + "nomic-ai/nomic-bert-2048" + ] + +tasks = mteb.get_tasks(tasks=["PubChemAIParagraphsParaphrasePC", + "PubChemAISentenceParaphrasePC", + "PubChemSynonymPC", + "PubChemSMILESIsoTitlePC", + "PubChemSMILESIsoDescPC", + "PubChemSMILESCanonTitlePC", + "PubChemSMILESCanonDescPC", + "PubChemWikiParagraphsPC", + "ChemNQRetrieval", + "ChemHotpotQARetrieval" + ]) + +for model_name in tqdm(models): + model = mteb.get_model(model_name) + evaluation = mteb.MTEB(tasks=tasks) + evaluation.run(model, output_folder="chem_results") From 947e07a81d9a590caba531b63416b113d540dd86 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sat, 17 Aug 2024 03:44:49 -0400 Subject: [PATCH 07/49] Add 15 Wikipedia article classification tasks --- mteb/tasks/Classification/__init__.py | 15 +++++++++ .../eng/WikipediaEZ10Classification.py | 31 +++++++++++++++++++ .../eng/WikipediaEZ2Classification.py | 31 +++++++++++++++++++ .../eng/WikipediaEasy10Classification.py | 31 +++++++++++++++++++ ...eneExpressionVsMetallurgyClassification.py | 31 +++++++++++++++++++ .../WikipediaEasy2GreenhouseVsEnantiopure.py | 31 +++++++++++++++++++ .../WikipediaEasy2SolidStateVsColloidal.py | 31 +++++++++++++++++++ .../WikipediaEasy2SpecialClassification.py | 31 +++++++++++++++++++ .../eng/WikipediaEasy5Classification.py | 31 +++++++++++++++++++ ...pediaHard2BioluminescenceVsLuminescence.py | 31 +++++++++++++++++++ ...IsotopesVsFissionProductsNuclearFission.py | 31 +++++++++++++++++++ ...pediaHard2SaltsVsSemiconductorMaterials.py | 31 +++++++++++++++++++ ...aMedium2BioluminescenceVsNeurochemistry.py | 31 +++++++++++++++++++ ...iaMedium2ComputationalVsSpectroscopists.py | 31 +++++++++++++++++++ ...stallographyVsChromatographyTitrationpH.py | 31 +++++++++++++++++++ .../eng/WikipediaMedium5Classification.py | 31 +++++++++++++++++++ 16 files changed, 480 insertions(+) create mode 100644 mteb/tasks/Classification/eng/WikipediaEZ10Classification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEZ2Classification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEasy10Classification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopure.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidal.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaEasy5Classification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescence.py create mode 100644 mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFission.py create mode 100644 mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterials.py create mode 100644 mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistry.py create mode 100644 mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopists.py create mode 100644 mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py create mode 100644 mteb/tasks/Classification/eng/WikipediaMedium5Classification.py diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index d8f87f8ea9..045150d330 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -37,6 +37,21 @@ from .eng.ToxicConversationsClassification import * from .eng.TweetSentimentExtractionClassification import * from .eng.TweetTopicSingleClassification import * +from .eng.WikipediaEasy2GeneExpressionVsMetallurgyClassification import * +from .eng.WikipediaEasy2GreenhouseVsEnantiopure import * +from .eng.WikipediaEasy2SolidStateVsColloidal import * +from .eng.WikipediaEasy2SpecialClassification import * +from .eng.WikipediaEasy5Classification import * +from .eng.WikipediaEasy10Classification import * +from .eng.WikipediaEZ2Classification import * +from .eng.WikipediaEZ10Classification import * +from .eng.WikipediaHard2BioluminescenceVsLuminescence import * +from .eng.WikipediaHard2IsotopesVsFissionProductsNuclearFission import * +from .eng.WikipediaHard2SaltsVsSemiconductorMaterials import * +from .eng.WikipediaMedium2BioluminescenceVsNeurochemistry import * +from .eng.WikipediaMedium2CrystallographyVsChromatographyTitrationpH import * +from .eng.WikipediaMedium2ComputationalVsSpectroscopists import * +from .eng.WikipediaMedium5Classification import * from .eng.YahooAnswersTopicsClassification import * from .eng.YelpReviewFullClassification import * from .est.estonian_valence import * diff --git a/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py b/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py new file mode 100644 index 0000000000..d63fdb7fed --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEZ10Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEZ10Classification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/WikipediaEZ10Class", + "revision": "4dcc49b22904b1c91d8a5eac701b1182ff988bda", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py b/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py new file mode 100644 index 0000000000..1ceddaf36d --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEZ2Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEZ2Classification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/WikipediaEZ2Class", + "revision": "03d7df0886b6c450a182f221d11f555a95507417", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py b/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py new file mode 100644 index 0000000000..023a89de58 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEasy10Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEasy10Classification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_10_Class", + "revision": "c5ae57cbc4acbf63c49f1b4b1408eb4209e5cf74", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py new file mode 100644 index 0000000000..7f2e7f7508 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEasy2GeneExpressionVsMetallurgyClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEasy2GeneExpressionVsMetallurgyClassification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Gene_Expression_vs_Metallurgy", + "revision": "2a386fa589c865c8bcd6afbf201bc4f871fe9ef6", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopure.py b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopure.py new file mode 100644 index 0000000000..c20d4ab858 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopure.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEasy2GreenhouseVsEnantiopure(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEasy2GreenhouseVsEnantiopure", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Greenhouse_vs_Enantiopure", + "revision": "92cddec63a3c8ef29dc72ebaba7204625d864a2b", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidal.py b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidal.py new file mode 100644 index 0000000000..83d5685158 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidal.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEasy2SolidStateVsColloidal(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEasy2SolidStateVsColloidal", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Solid_State_vs_Colloidal", + "revision": "c9d4228c53c402cf3d340d3ccbcdb2cc37c8d6f3", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py new file mode 100644 index 0000000000..28269d283a --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEasy2SpecialClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEasy2SpecialClassification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Special", + "revision": "2e748237767f6a8651901493ec5f20d9c125af11", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py b/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py new file mode 100644 index 0000000000..b34986b72a --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaEasy5Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaEasy5Classification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_5_Class", + "revision": "14df6a56f71a288622c63a473e81f9205af8e1a7", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescence.py b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescence.py new file mode 100644 index 0000000000..34f43acbd0 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescence.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaHard2BioluminescenceVsLuminescence(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaHard2BioluminescenceVsLuminescence", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Hard_2_Class_Bioluminescence_vs_Luminescence", + "revision": "907895e5fe3138626c1c8d8ff26ac90b3c447cf2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFission.py b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFission.py new file mode 100644 index 0000000000..26f47a27ec --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFission.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaHard2IsotopesVsFissionProductsNuclearFission(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaHard2IsotopesVsFissionProductsNuclearFission", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Hard_2_Class_Isotopes_vs_Fission_Products_Nuclear_Fission", + "revision": "9c3974e039e774828742e739a3cc7bced7b337d5", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterials.py b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterials.py new file mode 100644 index 0000000000..7cde0c13b7 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterials.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaHard2SaltsVsSemiconductorMaterials(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaHard2SaltsVsSemiconductorMaterials", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Hard_2_Class_Salts_vs_Semiconductor_Materials", + "revision": "b8f1f0eb9c3f54db47a6a6080938dcfdf307ef9f", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistry.py b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistry.py new file mode 100644 index 0000000000..2b0bec3925 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistry.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaMedium2BioluminescenceVsNeurochemistry(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaMedium2BioluminescenceVsNeurochemistry", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_2_Class_Bioluminescence_vs_Neurochemistry", + "revision": "4b1018f7a60702173d5ff9c08fda4704961ca3be", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopists.py b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopists.py new file mode 100644 index 0000000000..83712fe740 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopists.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaMedium2ComputationalVsSpectroscopists(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaMedium2ComputationalVsSpectroscopists", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_2_Class_Computational_vs_Spectroscopists", + "revision": "e74c1a94e9a0aca888324e89df2b7086a2f0923f", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py new file mode 100644 index 0000000000..3461c5696d --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaMedium2CrystallographyVsChromatographyTitrationpH(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaMedium2CrystallographyVsChromatographyTitrationpH", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_2_Class_Crystallography_vs_Chromatography_Titration_pH", + "revision": "f1b8f8ca2afd4e8e988e077ac7f42aeae1e1a51c", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py b/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py new file mode 100644 index 0000000000..19a31f7d8b --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaMedium5Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaMedium5Classification", + description="TBW", + reference="https://wikipedia.org", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_5_Class", + "revision": "bc4750e59690c46be0ea4cf41ab52d034cea3a06", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) From 47b550f8e6c0ec48747e50305b81ffb8dde38c80 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sat, 17 Aug 2024 09:00:11 -0400 Subject: [PATCH 08/49] Add PairClassification and BitextMining tasks for Coconut SMILES --- mteb/tasks/BitextMining/__init__.py | 2 + .../CoconutSmiles2NameBitextMining1.py | 61 ++++++++++++++++ .../CoconutSmiles2NameBitextMining2.py | 61 ++++++++++++++++ mteb/tasks/PairClassification/__init__.py | 11 +-- .../CoconutSmiles2NamePairClassification.py | 72 +++++++++++++++++++ 5 files changed, 202 insertions(+), 5 deletions(-) create mode 100644 mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py create mode 100644 mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py create mode 100644 mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index c176077215..b72e4da9c7 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -5,6 +5,8 @@ from .multilingual.BibleNLPBitextMining import * from .multilingual.BUCCBitextMining import * from .multilingual.BUCCBitextMiningFast import * +from .multilingual.CoconutSmiles2NameBitextMining1 import * +from .multilingual.CoconutSmiles2NameBitextMining2 import * from .multilingual.DiaBLaBitextMining import * from .multilingual.FloresBitextMining import * from .multilingual.IN22ConvBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py b/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py new file mode 100644 index 0000000000..eb4929e9a1 --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CoconutSmiles2NameBitextMining1(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="CoconutSmiles2NameBitextMining1", + dataset={ + "path": "BASF-We-Create-Chemistry/CoconutSmiles2NameBitextMining1", + "revision": "cd1089904b8633a55a3ab3fa379c7fd76c02c722" + }, + description="TBW", + reference="https://coconut.naturalproducts.net/", + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "en-en": ["en-Latn", "eng-Latn"] + }, + main_score="f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + + self.dataset = {} + + for lang in self.hf_subsets: + self.dataset[lang] = datasets.load_dataset( + **self.metadata_dict["dataset"]) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + def create_columns(row): + """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" + row["sentence1"] = row["name"] + row["sentence2"] = row["canonical_smiles"] + return row + + # Convert to standard format + for lang in self.hf_subsets: + self.dataset[lang] = self.dataset[lang].map(create_columns) diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py b/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py new file mode 100644 index 0000000000..a4e4ad9ee3 --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CoconutSmiles2NameBitextMining2(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="CoconutSmiles2NameBitextMining2", + dataset={ + "path": "BASF-We-Create-Chemistry/CoconutSmiles2NameBitextMining2", + "revision": "9613249c6b80f794d6cfd4c732c5889e94d7c96e" + }, + description="TBW", + reference="https://coconut.naturalproducts.net/", + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "en-en": ["en-Latn", "eng-Latn"] + }, + main_score="f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + + self.dataset = {} + + for lang in self.hf_subsets: + self.dataset[lang] = datasets.load_dataset( + **self.metadata_dict["dataset"]) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + def create_columns(row): + """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" + row["sentence1"] = row["name"] + row["sentence2"] = row["canonical_smiles"] + return row + + # Convert to standard format + for lang in self.hf_subsets: + self.dataset[lang] = self.dataset[lang].map(create_columns) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index e9d45fa9f4..10ae932494 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -3,18 +3,19 @@ from .ara.ArEntail import * from .ces.CTKFactsNLI import * from .deu.FalseFriendsDeEnPC import * +from .eng.CoconutSmiles2NamePairClassification import * from .eng.LegalBenchPC import * -from .eng.SprintDuplicateQuestionsPC import * -from .eng.TwitterSemEval2015PC import * -from .eng.TwitterURLCorpusPC import * -from .eng.PubChemWikiParagraphsPC import * -from .eng.PubChemAISentenceParaphrasePC import * from .eng.PubChemAIParagraphsParaphrasePC import * +from .eng.PubChemAISentenceParaphrasePC import * from .eng.PubChemSMILESCanonDescPC import * from .eng.PubChemSMILESCanonTitlePC import * from .eng.PubChemSMILESIsoDescPC import * from .eng.PubChemSMILESIsoTitlePC import * from .eng.PubChemSynonymPC import * +from .eng.PubChemWikiParagraphsPC import * +from .eng.SprintDuplicateQuestionsPC import * +from .eng.TwitterSemEval2015PC import * +from .eng.TwitterURLCorpusPC import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py b/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py new file mode 100644 index 0000000000..a1fbb85f38 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py @@ -0,0 +1,72 @@ +from __future__ import annotations + + +from typing import Any + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +_DATASET_COLUMN_MAP = { + "sentence1": "name", + "sentence2": "canonical_smiles", + "labels": "label", +} + + +class CoconutSmiles2NamePairClassification(AbsTaskPairClassification): + metadata = TaskMetadata( + name="CoconutSmiles2NamePairClassification", + description="""TBW""", + reference="https://coconut.naturalproducts.net/", + dataset={ + "path": "BASF-We-Create-Chemistry/CoconutSmiles2NamePairClassification", + "revision": "6e7c8b5419a8b437fc1217bb35101a198e723db6" + }, + type="PairClassification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=None, + sample_creation="created", + bibtex_citation=None, + descriptive_stats={} + ) + + def load_data(self, **kwargs: Any) -> None: + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + self.dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + trust_remote_code=True, + ) + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], label=_DATASET_COLUMN_MAP["labels"] + ) + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], + "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], + "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]] + } + ] + self.dataset = _dataset From 79d9111920c6bd94ca1066b342d6b204b5e68ed5 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 18 Aug 2024 07:06:22 -0400 Subject: [PATCH 09/49] Fix naming of some Classification and PairClassification tasks --- mteb/tasks/Classification/__init__.py | 16 ++++++++-------- ...asy2GreenhouseVsEnantiopureClassification.py} | 0 ...aEasy2SolidStateVsColloidalClassification.py} | 0 ...oluminescenceVsLuminescenceClassification.py} | 0 ...ssionProductsNuclearFissionClassification.py} | 0 ...ltsVsSemiconductorMaterialsClassification.py} | 0 ...uminescenceVsNeurochemistryClassification.py} | 0 ...putationalVsSpectroscopistsClassification.py} | 0 ...VsChromatographyTitrationpHClassification.py} | 0 mteb/tasks/PairClassification/__init__.py | 2 +- ...Classification.py => CoconutSmiles2NamePC.py} | 4 ++-- 11 files changed, 11 insertions(+), 11 deletions(-) rename mteb/tasks/Classification/eng/{WikipediaEasy2GreenhouseVsEnantiopure.py => WikipediaEasy2GreenhouseVsEnantiopureClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaEasy2SolidStateVsColloidal.py => WikipediaEasy2SolidStateVsColloidalClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaHard2BioluminescenceVsLuminescence.py => WikipediaHard2BioluminescenceVsLuminescenceClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaHard2IsotopesVsFissionProductsNuclearFission.py => WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaHard2SaltsVsSemiconductorMaterials.py => WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaMedium2BioluminescenceVsNeurochemistry.py => WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaMedium2ComputationalVsSpectroscopists.py => WikipediaMedium2ComputationalVsSpectroscopistsClassification.py} (100%) rename mteb/tasks/Classification/eng/{WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py => WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py} (100%) rename mteb/tasks/PairClassification/eng/{CoconutSmiles2NamePairClassification.py => CoconutSmiles2NamePC.py} (94%) diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 045150d330..317267446d 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -38,19 +38,19 @@ from .eng.TweetSentimentExtractionClassification import * from .eng.TweetTopicSingleClassification import * from .eng.WikipediaEasy2GeneExpressionVsMetallurgyClassification import * -from .eng.WikipediaEasy2GreenhouseVsEnantiopure import * -from .eng.WikipediaEasy2SolidStateVsColloidal import * +from .eng.WikipediaEasy2GreenhouseVsEnantiopureClassification import * +from .eng.WikipediaEasy2SolidStateVsColloidalClassification import * from .eng.WikipediaEasy2SpecialClassification import * from .eng.WikipediaEasy5Classification import * from .eng.WikipediaEasy10Classification import * from .eng.WikipediaEZ2Classification import * from .eng.WikipediaEZ10Classification import * -from .eng.WikipediaHard2BioluminescenceVsLuminescence import * -from .eng.WikipediaHard2IsotopesVsFissionProductsNuclearFission import * -from .eng.WikipediaHard2SaltsVsSemiconductorMaterials import * -from .eng.WikipediaMedium2BioluminescenceVsNeurochemistry import * -from .eng.WikipediaMedium2CrystallographyVsChromatographyTitrationpH import * -from .eng.WikipediaMedium2ComputationalVsSpectroscopists import * +from .eng.WikipediaHard2BioluminescenceVsLuminescenceClassification import * +from .eng.WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification import * +from .eng.WikipediaHard2SaltsVsSemiconductorMaterialsClassification import * +from .eng.WikipediaMedium2BioluminescenceVsNeurochemistryClassification import * +from .eng.WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification import * +from .eng.WikipediaMedium2ComputationalVsSpectroscopistsClassification import * from .eng.WikipediaMedium5Classification import * from .eng.YahooAnswersTopicsClassification import * from .eng.YelpReviewFullClassification import * diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopure.py b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopure.py rename to mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidal.py b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidal.py rename to mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescence.py b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescence.py rename to mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFission.py b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFission.py rename to mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterials.py b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterials.py rename to mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistry.py b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistry.py rename to mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopists.py b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopists.py rename to mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py similarity index 100% rename from mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpH.py rename to mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 10ae932494..5ba6e47dd8 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -3,7 +3,7 @@ from .ara.ArEntail import * from .ces.CTKFactsNLI import * from .deu.FalseFriendsDeEnPC import * -from .eng.CoconutSmiles2NamePairClassification import * +from .eng.CoconutSmiles2NamePC import * from .eng.LegalBenchPC import * from .eng.PubChemAIParagraphsParaphrasePC import * from .eng.PubChemAISentenceParaphrasePC import * diff --git a/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py b/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePC.py similarity index 94% rename from mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py rename to mteb/tasks/PairClassification/eng/CoconutSmiles2NamePC.py index a1fbb85f38..3880c42eff 100644 --- a/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePairClassification.py +++ b/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePC.py @@ -16,9 +16,9 @@ } -class CoconutSmiles2NamePairClassification(AbsTaskPairClassification): +class CoconutSmiles2NamePC(AbsTaskPairClassification): metadata = TaskMetadata( - name="CoconutSmiles2NamePairClassification", + name="CoconutSmiles2NamePC", description="""TBW""", reference="https://coconut.naturalproducts.net/", dataset={ From 17f8be1b07522bb6eacdd90d3a9d713ecf6c9dc7 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 18 Aug 2024 07:29:21 -0400 Subject: [PATCH 10/49] Fix some classification tasks naming issues --- .../WikipediaEasy2GreenhouseVsEnantiopureClassification.py | 4 ++-- .../eng/WikipediaEasy2SolidStateVsColloidalClassification.py | 4 ++-- ...kipediaHard2BioluminescenceVsLuminescenceClassification.py | 4 ++-- ...d2IsotopesVsFissionProductsNuclearFissionClassification.py | 4 ++-- ...kipediaHard2SaltsVsSemiconductorMaterialsClassification.py | 4 ++-- ...diaMedium2BioluminescenceVsNeurochemistryClassification.py | 4 ++-- ...ediaMedium2ComputationalVsSpectroscopistsClassification.py | 4 ++-- ...rystallographyVsChromatographyTitrationpHClassification.py | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py index c20d4ab858..80f647b923 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaEasy2GreenhouseVsEnantiopure(AbsTaskClassification): +class WikipediaEasy2GreenhouseVsEnantiopureClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaEasy2GreenhouseVsEnantiopure", + name="WikipediaEasy2GreenhouseVsEnantiopureClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py index 83d5685158..95facf2936 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaEasy2SolidStateVsColloidal(AbsTaskClassification): +class WikipediaEasy2SolidStateVsColloidalClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaEasy2SolidStateVsColloidal", + name="WikipediaEasy2SolidStateVsColloidalClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py index 34f43acbd0..b3c7a1a8f6 100644 --- a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaHard2BioluminescenceVsLuminescence(AbsTaskClassification): +class WikipediaHard2BioluminescenceVsLuminescenceClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaHard2BioluminescenceVsLuminescence", + name="WikipediaHard2BioluminescenceVsLuminescenceClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py index 26f47a27ec..029893dc73 100644 --- a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaHard2IsotopesVsFissionProductsNuclearFission(AbsTaskClassification): +class WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaHard2IsotopesVsFissionProductsNuclearFission", + name="WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py index 7cde0c13b7..0b73298662 100644 --- a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaHard2SaltsVsSemiconductorMaterials(AbsTaskClassification): +class WikipediaHard2SaltsVsSemiconductorMaterialsClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaHard2SaltsVsSemiconductorMaterials", + name="WikipediaHard2SaltsVsSemiconductorMaterialsClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py index 2b0bec3925..7aae4f87ff 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaMedium2BioluminescenceVsNeurochemistry(AbsTaskClassification): +class WikipediaMedium2BioluminescenceVsNeurochemistryClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaMedium2BioluminescenceVsNeurochemistry", + name="WikipediaMedium2BioluminescenceVsNeurochemistryClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py index 83712fe740..635b74c52b 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaMedium2ComputationalVsSpectroscopists(AbsTaskClassification): +class WikipediaMedium2ComputationalVsSpectroscopistsClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaMedium2ComputationalVsSpectroscopists", + name="WikipediaMedium2ComputationalVsSpectroscopistsClassification", description="TBW", reference="https://wikipedia.org", dataset={ diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py index 3461c5696d..f9b3a57d78 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py @@ -4,9 +4,9 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class WikipediaMedium2CrystallographyVsChromatographyTitrationpH(AbsTaskClassification): +class WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WikipediaMedium2CrystallographyVsChromatographyTitrationpH", + name="WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", description="TBW", reference="https://wikipedia.org", dataset={ From bb779552b599748faa3079d205d28a5266a07526 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 19 Aug 2024 09:09:41 +0000 Subject: [PATCH 11/49] Integrate WANDB with benchmarking script --- chem_eval.py | 149 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 123 insertions(+), 26 deletions(-) diff --git a/chem_eval.py b/chem_eval.py index b8cbbff3b9..e26b6d6b51 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -1,29 +1,126 @@ import mteb +import os from tqdm import tqdm +import wandb +import json +import time -models = ["allenai/scibert_scivocab_uncased", - "google-bert/bert-base-uncased", - "intfloat/multilingual-e5-small", - "intfloat/multilingual-e5-base", - "intfloat/multilingual-e5-large", - "nomic-ai/nomic-embed-text-v1.5", - "nomic-ai/nomic-embed-text-v1", - "nomic-ai/nomic-bert-2048" - ] - -tasks = mteb.get_tasks(tasks=["PubChemAIParagraphsParaphrasePC", - "PubChemAISentenceParaphrasePC", - "PubChemSynonymPC", - "PubChemSMILESIsoTitlePC", - "PubChemSMILESIsoDescPC", - "PubChemSMILESCanonTitlePC", - "PubChemSMILESCanonDescPC", - "PubChemWikiParagraphsPC", - "ChemNQRetrieval", - "ChemHotpotQARetrieval" - ]) - -for model_name in tqdm(models): - model = mteb.get_model(model_name) - evaluation = mteb.MTEB(tasks=tasks) - evaluation.run(model, output_folder="chem_results") + +def is_run_available(model_name, model_revision): + api = wandb.Api() + runs = api.runs('Chembedding - Benchmarking') + for run in runs: + if run.name == model_name and run.config['revision'] == model_revision and run.state == "finished": + return True + return False + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def json_parser(data): + task_name = data["task_name"] + output = {} + if task_name.endswith("PC"): + output["Max F1"] = data["scores"]["test"][0]["main_score"] + elif task_name.endswith("Classification"): + output["Accuracy"] = data["scores"]["test"][0]["main_score"] + elif "BitextMining" in task_name: + output["F1"] = data["scores"]["test"][0]["main_score"] + elif task_name.endswith("Retrieval"): + output["NDCG@10"] = data["scores"]["test"][0]["main_score"] + return output + + +if __name__ == "__main__": + now = time.time() + + models = {"google-bert/bert-base-uncased": "86b5e0934494bd15c9632b12f734a8a67f723594", + "allenai/scibert_scivocab_uncased": "24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1", + "nomic-ai/nomic-bert-2048": "no_revision_available", + "intfloat/multilingual-e5-small": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "intfloat/multilingual-e5-base": "d13f1b27baf31030b7fd040960d60d909913633f", + "intfloat/multilingual-e5-large": "4dc6d853a804b9c8886ede6dda8a073b7dc08a81", + "nomic-ai/nomic-embed-text-v1": "0759316f275aa0cb93a5b830973843ca66babcf5", + "nomic-ai/nomic-embed-text-v1.5": "b0753ae76394dd36bcfb912a46018088bca48be0", + } + + all_tasks = [ + "CoconutSmiles2NamePC", + "PubChemAIParagraphsParaphrasePC", + "PubChemAISentenceParaphrasePC", + "PubChemSMILESCanonDescPC", + "PubChemSMILESCanonTitlePC", + "PubChemSMILESIsoDescPC", + "PubChemSMILESIsoTitlePC", + "PubChemSynonymPC", + "PubChemWikiParagraphsPC", + + "WikipediaEasy2GeneExpressionVsMetallurgyClassification", + "WikipediaEasy2GreenhouseVsEnantiopureClassification", + "WikipediaEasy2SolidStateVsColloidalClassification", + "WikipediaEasy2SpecialClassification", + "WikipediaEasy5Classification", + "WikipediaEasy10Classification", + "WikipediaEZ2Classification", + "WikipediaEZ10Classification", + "WikipediaHard2BioluminescenceVsLuminescenceClassification", + "WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", + "WikipediaHard2SaltsVsSemiconductorMaterialsClassification", + "WikipediaMedium2BioluminescenceVsNeurochemistryClassification", + "WikipediaMedium2ComputationalVsSpectroscopistsClassification", + "WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", + "WikipediaMedium5Classification", + + "ChemNQRetrieval", + "ChemHotpotQARetrieval", + + "CoconutSmiles2NameBitextMining1", + "CoconutSmiles2NameBitextMining2" + ] + + tasks = mteb.get_tasks(tasks=all_tasks) + + for model_full_name, model_rev in tqdm(models.items()): + model_name = model_full_name.split("/")[1] + + if is_run_available(model_name, model_rev): + print(f"Skipping {model_name} - {model_rev}") + continue + + wandb.init(project='Chembedding - Benchmarking', name=model_name, + config={"revision": model_rev}) + model = mteb.get_model(model_full_name) + evaluation = mteb.MTEB(tasks=tasks) + evaluation.run(model, output_folder="chem_results", + overwrite_results=False) + table = wandb.Table(columns=["Task", "Metric", "Score"]) + + for task_name in tqdm(all_tasks): + data = read_json(os.path.join( + "chem_results", + model_full_name.replace("/", "__"), + model_rev, + task_name + '.json', + )) + output = json_parser(data) + wandb.log(output) + + for metric, score in output.items(): + table = wandb.Table(data=[[metric, score]], + columns=["Metric", "Score"]) + bar_plot = wandb.plot.bar( + table, "Metric", "Score", title=f"{task_name} Performance") + wandb.log({f"{task_name}_bar_plot": bar_plot}) + + wandb.finish() + + elapsed = time.time() - now + + hours = int(elapsed // 3600) + minutes = int((elapsed % 3600) // 60) + seconds = int(elapsed % 60) + + print(f"Elapsed time: {hours} hours, {minutes} minutes, {seconds} seconds") \ No newline at end of file From d287801100d2078123a3d0ffbf0e1f3ebcb4d4f3 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 19 Aug 2024 09:11:29 +0000 Subject: [PATCH 12/49] Update .gitignore --- .gitignore | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3219560494..98256c621d 100644 --- a/.gitignore +++ b/.gitignore @@ -143,4 +143,10 @@ sb.ipynb tests/create_meta/model_card.md # removed results from mteb repo they are now available at: https://github.com/embeddings-benchmark/results -results/ \ No newline at end of file +results/ + +# Weights and Biases +wandb/ + +# Chemical results +chem_results/ \ No newline at end of file From 107fba5fe855f5ade5f6d885d62e02dd819dabe0 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Tue, 20 Aug 2024 07:05:25 +0000 Subject: [PATCH 13/49] Fix `nomic_models.py` issue with retrieval tasks, similar to issue #1115 in original repo --- mteb/models/nomic_models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index d05d65c16d..84ada0e070 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -77,6 +77,9 @@ def encode_corpus( if "prompt_name" in kwargs: kwargs.pop("prompt_name") + if "request_qid" in kwargs: + kwargs.pop("request_qid") + sentences = corpus_to_texts(corpus) emb = self.encode( sentences, batch_size=batch_size, input_type="search_document", **kwargs From 82aa5593c61d6e6ab3b4cb9feacc1f62cf47d6d0 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Wed, 21 Aug 2024 08:16:58 +0000 Subject: [PATCH 14/49] Add one chemical model and some SentenceTransformer models --- chem_eval.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/chem_eval.py b/chem_eval.py index e26b6d6b51..faf854a539 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -45,6 +45,11 @@ def json_parser(data): "intfloat/multilingual-e5-large": "4dc6d853a804b9c8886ede6dda8a073b7dc08a81", "nomic-ai/nomic-embed-text-v1": "0759316f275aa0cb93a5b830973843ca66babcf5", "nomic-ai/nomic-embed-text-v1.5": "b0753ae76394dd36bcfb912a46018088bca48be0", + "recobo/chemical-bert-uncased": "498698d28fcf7ce5954852a0444c864bdf232b64", + "all-mpnet-base-v2": "84f2bcc00d77236f9e89c8a360a00fb1139bf47d", + "multi-qa-mpnet-base-dot-v1": "3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f", + "all-MiniLM-L12-v2": "a05860a77cef7b37e0048a7864658139bc18a854", + "all-MiniLM-L6-v2": "8b3219a92973c328a8e22fadcfa821b5dc75636a" } all_tasks = [ @@ -96,7 +101,6 @@ def json_parser(data): evaluation = mteb.MTEB(tasks=tasks) evaluation.run(model, output_folder="chem_results", overwrite_results=False) - table = wandb.Table(columns=["Task", "Metric", "Score"]) for task_name in tqdm(all_tasks): data = read_json(os.path.join( From 90c5ecba21ee089aef33832008cf7ea596abb589 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Wed, 21 Aug 2024 08:33:42 +0000 Subject: [PATCH 15/49] Fix a naming issue for SentenceTransformer models --- chem_eval.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/chem_eval.py b/chem_eval.py index faf854a539..67b2e5e0f0 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -89,7 +89,10 @@ def json_parser(data): tasks = mteb.get_tasks(tasks=all_tasks) for model_full_name, model_rev in tqdm(models.items()): - model_name = model_full_name.split("/")[1] + if "/" in model_full_name: + model_name = model_full_name.split("/")[1] + else: + model_name = model_full_name if is_run_available(model_name, model_rev): print(f"Skipping {model_name} - {model_rev}") From 0c2deda4924f95a4f3c8dc77ca4210cc48c0e799 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 5 Sep 2024 21:02:11 +0000 Subject: [PATCH 16/49] Add OpenAI, bge-m3 and matscibert models --- chem_eval.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/chem_eval.py b/chem_eval.py index 67b2e5e0f0..1cff219b49 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -46,10 +46,15 @@ def json_parser(data): "nomic-ai/nomic-embed-text-v1": "0759316f275aa0cb93a5b830973843ca66babcf5", "nomic-ai/nomic-embed-text-v1.5": "b0753ae76394dd36bcfb912a46018088bca48be0", "recobo/chemical-bert-uncased": "498698d28fcf7ce5954852a0444c864bdf232b64", + "BAAI/bge-m3": "5617a9f61b028005a4858fdac845db406aefb181", "all-mpnet-base-v2": "84f2bcc00d77236f9e89c8a360a00fb1139bf47d", "multi-qa-mpnet-base-dot-v1": "3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f", "all-MiniLM-L12-v2": "a05860a77cef7b37e0048a7864658139bc18a854", - "all-MiniLM-L6-v2": "8b3219a92973c328a8e22fadcfa821b5dc75636a" + "all-MiniLM-L6-v2": "8b3219a92973c328a8e22fadcfa821b5dc75636a", + "m3rg-iitd/matscibert": "ced9d8f5f208712c4a90f98a246fe32155b29995", + "text-embedding-ada-002": "1", + "text-embedding-3-small": "1", + "text-embedding-3-large": "1", } all_tasks = [ From 4e9f309cc881bb04f8972b5561126e1262b7afa9 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 8 Sep 2024 13:41:04 +0000 Subject: [PATCH 17/49] Add PubChem SMILES Bitext Mining tasks --- chem_eval.py | 6 +- mteb/tasks/BitextMining/__init__.py | 4 ++ .../multilingual/PubChemSMILESCanonDescBM.py | 61 +++++++++++++++++++ .../multilingual/PubChemSMILESCanonTitleBM.py | 61 +++++++++++++++++++ .../multilingual/PubChemSMILESISoDescBM.py | 61 +++++++++++++++++++ .../multilingual/PubChemSMILESISoTitleBM.py | 61 +++++++++++++++++++ 6 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py create mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py create mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py create mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py diff --git a/chem_eval.py b/chem_eval.py index 1cff219b49..8cc4a06e23 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -88,7 +88,11 @@ def json_parser(data): "ChemHotpotQARetrieval", "CoconutSmiles2NameBitextMining1", - "CoconutSmiles2NameBitextMining2" + "CoconutSmiles2NameBitextMining2", + "PubChemSMILESISoTitleBM", + "PubChemSMILESCanonTitleBM", + "PubChemSMILESISoDescBM", + "PubChemSMILESCanonDescBM" ] tasks = mteb.get_tasks(tasks=all_tasks) diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index b72e4da9c7..44c975d382 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -20,6 +20,10 @@ from .multilingual.NusaTranslationBitextMining import * from .multilingual.NusaXBitextMining import * from .multilingual.PhincBitextMining import * +from .multilingual.PubChemSMILESCanonDescBM import * +from .multilingual.PubChemSMILESCanonTitleBM import * +from .multilingual.PubChemSMILESISoDescBM import * +from .multilingual.PubChemSMILESISoTitleBM import * from .multilingual.RomaTalesBitextMining import * from .multilingual.TatoebaBitextMining import * from .srn.SRNCorpusBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py new file mode 100644 index 0000000000..e57d707a55 --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemSMILESCanonDescBM(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="PubChemSMILESCanonDescBM", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonDescBM", + "revision": "c1b58c3d6c27b71c70a509b18915e841c1c89484" + }, + description="TBW", + reference="https://pubchem.ncbi.nlm.nih.gov/", + type="BitextMining", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "en-en": ["en-Latn", "eng-Latn"] + }, + main_score="f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + + self.dataset = {} + + for lang in self.hf_subsets: + self.dataset[lang] = datasets.load_dataset( + **self.metadata_dict["dataset"]) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + def create_columns(row): + """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" + row["sentence1"] = row["description"] + row["sentence2"] = row["canonical_smiles"] + return row + + # Convert to standard format + for lang in self.hf_subsets: + self.dataset[lang] = self.dataset[lang].map(create_columns) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py new file mode 100644 index 0000000000..51299ff0d0 --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemSMILESCanonTitleBM(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="PubChemSMILESCanonTitleBM", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonTitleBM", + "revision": "4e7fd0815c39f407e6fd08c9b1d5bf481022ef4f" + }, + description="TBW", + reference="https://pubchem.ncbi.nlm.nih.gov/", + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "en-en": ["en-Latn", "eng-Latn"] + }, + main_score="f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + + self.dataset = {} + + for lang in self.hf_subsets: + self.dataset[lang] = datasets.load_dataset( + **self.metadata_dict["dataset"]) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + def create_columns(row): + """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" + row["sentence1"] = row["title"] + row["sentence2"] = row["canonical_smiles"] + return row + + # Convert to standard format + for lang in self.hf_subsets: + self.dataset[lang] = self.dataset[lang].map(create_columns) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py new file mode 100644 index 0000000000..03a449884d --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemSMILESISoDescBM(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="PubChemSMILESISoDescBM", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoDescBM", + "revision": "a08846b55a60a0361a9ee77a1b2487af4740172d" + }, + description="TBW", + reference="https://pubchem.ncbi.nlm.nih.gov/", + type="BitextMining", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "en-en": ["en-Latn", "eng-Latn"] + }, + main_score="f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + + self.dataset = {} + + for lang in self.hf_subsets: + self.dataset[lang] = datasets.load_dataset( + **self.metadata_dict["dataset"]) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + def create_columns(row): + """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" + row["sentence1"] = row["description"] + row["sentence2"] = row["isomeric_smiles"] + return row + + # Convert to standard format + for lang in self.hf_subsets: + self.dataset[lang] = self.dataset[lang].map(create_columns) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py new file mode 100644 index 0000000000..da0a15abdb --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemSMILESISoTitleBM(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="PubChemSMILESISoTitleBM", + dataset={ + "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoTitleBM", + "revision": "2a39b190693f847ae14bb2515bd2347fd6f808a6" + }, + description="TBW", + reference="https://pubchem.ncbi.nlm.nih.gov/", + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "en-en": ["en-Latn", "eng-Latn"] + }, + main_score="f1", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + + self.dataset = {} + + for lang in self.hf_subsets: + self.dataset[lang] = datasets.load_dataset( + **self.metadata_dict["dataset"]) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + def create_columns(row): + """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" + row["sentence1"] = row["title"] + row["sentence2"] = row["isomeric_smiles"] + return row + + # Convert to standard format + for lang in self.hf_subsets: + self.dataset[lang] = self.dataset[lang].map(create_columns) From 52d18317c8473011132f5a5ce1b6aaf4e1d9afe9 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 8 Sep 2024 13:59:43 +0000 Subject: [PATCH 18/49] Change metric namings to be more descriptive --- chem_eval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/chem_eval.py b/chem_eval.py index 8cc4a06e23..af1e3a6bcb 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -24,13 +24,13 @@ def json_parser(data): task_name = data["task_name"] output = {} if task_name.endswith("PC"): - output["Max F1"] = data["scores"]["test"][0]["main_score"] + output["PairClassification (Max F1)"] = data["scores"]["test"][0]["main_score"] elif task_name.endswith("Classification"): - output["Accuracy"] = data["scores"]["test"][0]["main_score"] - elif "BitextMining" in task_name: - output["F1"] = data["scores"]["test"][0]["main_score"] + output["Classification (Accuracy)"] = data["scores"]["test"][0]["main_score"] + elif "BitextMining" in task_name or task_name.endswith("BM"): + output["Bitext Mining (F1)"] = data["scores"]["test"][0]["main_score"] elif task_name.endswith("Retrieval"): - output["NDCG@10"] = data["scores"]["test"][0]["main_score"] + output["Retrieval (NDCG@10)"] = data["scores"]["test"][0]["main_score"] return output From 5c4e5010fecc1795d05bb3b641047a5aa35e6ecd Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 8 Sep 2024 17:28:15 +0000 Subject: [PATCH 19/49] Add English e5 and bge v1 models, all the sizes --- chem_eval.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/chem_eval.py b/chem_eval.py index af1e3a6bcb..3a7f1fb6a8 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -40,6 +40,12 @@ def json_parser(data): models = {"google-bert/bert-base-uncased": "86b5e0934494bd15c9632b12f734a8a67f723594", "allenai/scibert_scivocab_uncased": "24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1", "nomic-ai/nomic-bert-2048": "no_revision_available", + "intfloat/e5-small": "e272f3049e853b47cb5ca3952268c6662abda68f", + "intfloat/e5-base": "b533fe4636f4a2507c08ddab40644d20b0006d6a", + "intfloat/e5-large": "4dc6d853a804b9c8886ede6dda8a073b7dc08a81", + "intfloat/e5-small-v2": "dca8b1a9dae0d4575df2bf423a5edb485a431236", + "intfloat/e5-base-v2": "1c644c92ad3ba1efdad3f1451a637716616a20e8", + "intfloat/e5-large-v2": "b322e09026e4ea05f42beadf4d661fb4e101d311", "intfloat/multilingual-e5-small": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", "intfloat/multilingual-e5-base": "d13f1b27baf31030b7fd040960d60d909913633f", "intfloat/multilingual-e5-large": "4dc6d853a804b9c8886ede6dda8a073b7dc08a81", @@ -47,6 +53,9 @@ def json_parser(data): "nomic-ai/nomic-embed-text-v1.5": "b0753ae76394dd36bcfb912a46018088bca48be0", "recobo/chemical-bert-uncased": "498698d28fcf7ce5954852a0444c864bdf232b64", "BAAI/bge-m3": "5617a9f61b028005a4858fdac845db406aefb181", + "BAAI/bge-small-en": "2275a7bdee235e9b4f01fa73aa60d3311983cfea", + "BAAI/bge-base-en": "b737bf5dcc6ee8bdc530531266b4804a5d77b5d8", + "BAAI/bge-large-en": "abe7d9d814b775ca171121fb03f394dc42974275", "all-mpnet-base-v2": "84f2bcc00d77236f9e89c8a360a00fb1139bf47d", "multi-qa-mpnet-base-dot-v1": "3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f", "all-MiniLM-L12-v2": "a05860a77cef7b37e0048a7864658139bc18a854", From 30996cd52eb326c8d8db81ea556634296765a809 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 8 Sep 2024 17:30:03 +0000 Subject: [PATCH 20/49] Add two Wikipedia Clustering tasks --- chem_eval.py | 12 ++++--- mteb/tasks/Clustering/__init__.py | 2 ++ .../eng/WikipediaEasy10Clustering.py | 32 +++++++++++++++++++ .../eng/WikipediaMedium5Clustering.py | 32 +++++++++++++++++++ 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py create mode 100644 mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py diff --git a/chem_eval.py b/chem_eval.py index 3a7f1fb6a8..da24304d84 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -67,6 +67,7 @@ def json_parser(data): } all_tasks = [ + # Pair Classification "CoconutSmiles2NamePC", "PubChemAIParagraphsParaphrasePC", "PubChemAISentenceParaphrasePC", @@ -76,7 +77,7 @@ def json_parser(data): "PubChemSMILESIsoTitlePC", "PubChemSynonymPC", "PubChemWikiParagraphsPC", - + # Classification "WikipediaEasy2GeneExpressionVsMetallurgyClassification", "WikipediaEasy2GreenhouseVsEnantiopureClassification", "WikipediaEasy2SolidStateVsColloidalClassification", @@ -92,16 +93,19 @@ def json_parser(data): "WikipediaMedium2ComputationalVsSpectroscopistsClassification", "WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", "WikipediaMedium5Classification", - + # Retrieval "ChemNQRetrieval", "ChemHotpotQARetrieval", - + # Bitext Mining "CoconutSmiles2NameBitextMining1", "CoconutSmiles2NameBitextMining2", "PubChemSMILESISoTitleBM", "PubChemSMILESCanonTitleBM", "PubChemSMILESISoDescBM", - "PubChemSMILESCanonDescBM" + "PubChemSMILESCanonDescBM", + # Clustering + "WikipediaEasy10Clustering", + "WikipediaMedium5Clustering" ] tasks = mteb.get_tasks(tasks=all_tasks) diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 014796a4cb..fc92267c3e 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -18,6 +18,8 @@ from .eng.StackExchangeClusteringP2P import * from .eng.TwentyNewsgroupsClustering import * from .eng.WikiCitiesClustering import * +from .eng.WikipediaEasy10Clustering import * +from .eng.WikipediaMedium5Clustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py b/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py new file mode 100644 index 0000000000..1ae5c92788 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class WikipediaEasy10Clustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaEasy10Clustering", + description="TBW", + reference="https://huggingface.co/datasets/wikipedia", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Clustering_Easy_10_Class", + "revision": "67aa9e201b030038d16241a39795f5b5e5a89898", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=None, + domains=None, + task_subtypes=[], + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": None, "avg_character_length": None}, + ) diff --git a/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py b/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py new file mode 100644 index 0000000000..6f366ad200 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class WikipediaMedium5Clustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaMedium5Clustering", + description="TBW", + reference="https://huggingface.co/datasets/wikipedia", + dataset={ + "path": "BASF-We-Create-Chemistry/Wikipedia_Clustering_Medium_5_Class", + "revision": "178f49f21672a31f3fc94ac28e5703eb7c8d3291", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=None, + domains=None, + task_subtypes=[], + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={"n_samples": None, "avg_character_length": None}, + ) From c9198b02c66e26d8c04ac5fe3e37f049bac74b6b Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 8 Sep 2024 17:36:25 +0000 Subject: [PATCH 21/49] Add a try-except in evaluation script to skip faulty models during the benchmark. --- chem_eval.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/chem_eval.py b/chem_eval.py index da24304d84..0c0be95c17 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -120,12 +120,17 @@ def json_parser(data): print(f"Skipping {model_name} - {model_rev}") continue - wandb.init(project='Chembedding - Benchmarking', name=model_name, - config={"revision": model_rev}) - model = mteb.get_model(model_full_name) - evaluation = mteb.MTEB(tasks=tasks) - evaluation.run(model, output_folder="chem_results", - overwrite_results=False) + try: + wandb.init(project='Chembedding - Benchmarking', name=model_name, + config={"revision": model_rev}) + model = mteb.get_model(model_full_name) + evaluation = mteb.MTEB(tasks=tasks) + evaluation.run(model, output_folder="chem_results", + overwrite_results=False) + except Exception as e: + print(f"Error Evaluating Model {model_name}: {e}") + wandb.finish() + continue for task_name in tqdm(all_tasks): data = read_json(os.path.join( From 043e6fd9cd1ed8f1167bd0f05f585aa22feef44a Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 8 Sep 2024 18:23:35 +0000 Subject: [PATCH 22/49] Add bge v1.5 models and clustering score extraction to json parser --- chem_eval.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/chem_eval.py b/chem_eval.py index 0c0be95c17..d132c7fffb 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -31,6 +31,8 @@ def json_parser(data): output["Bitext Mining (F1)"] = data["scores"]["test"][0]["main_score"] elif task_name.endswith("Retrieval"): output["Retrieval (NDCG@10)"] = data["scores"]["test"][0]["main_score"] + elif task_name.endswith("Clustering"): + output["Clustering (V Measure)"] = data["scores"]["test"][0]["main_score"] return output @@ -56,6 +58,9 @@ def json_parser(data): "BAAI/bge-small-en": "2275a7bdee235e9b4f01fa73aa60d3311983cfea", "BAAI/bge-base-en": "b737bf5dcc6ee8bdc530531266b4804a5d77b5d8", "BAAI/bge-large-en": "abe7d9d814b775ca171121fb03f394dc42974275", + "BAAI/bge-small-en-v1.5": "5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", + "BAAI/bge-base-en-v1.5": "a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", + "BAAI/bge-large-en-v1.5": "d4aa6901d3a41ba39fb536a557fa166f842b0e09", "all-mpnet-base-v2": "84f2bcc00d77236f9e89c8a360a00fb1139bf47d", "multi-qa-mpnet-base-dot-v1": "3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f", "all-MiniLM-L12-v2": "a05860a77cef7b37e0048a7864658139bc18a854", From 1be7bf2d1cc9d466cd38fafd611b3c8850cfa001 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Mon, 9 Sep 2024 05:17:00 +0000 Subject: [PATCH 23/49] Add Amazon Titan embedding models --- chem_eval.py | 2 + mteb/models/__init__.py | 4 +- mteb/models/amazon_models.py | 81 ++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 mteb/models/amazon_models.py diff --git a/chem_eval.py b/chem_eval.py index d132c7fffb..3cadda2c8b 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -69,6 +69,8 @@ def json_parser(data): "text-embedding-ada-002": "1", "text-embedding-3-small": "1", "text-embedding-3-large": "1", + "amazon-titan-embed-text-v1": "1", + "amazon-titan-embed-text-v2": "1", } all_tasks = [ diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 668ed9f7ba..43874823bd 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -8,6 +8,7 @@ from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode from mteb.model_meta import ModelMeta from mteb.models import ( + amazon_models, bge_models, bm25, cohere_models, @@ -24,7 +25,7 @@ ru_sentence_models, salesforce_models, sentence_transformers_models, - voyage_models, + voyage_models ) logger = logging.getLogger(__name__) @@ -140,6 +141,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe sentence_transformers_models, voyage_models, google_models, + amazon_models ] models = {} diff --git a/mteb/models/amazon_models.py b/mteb/models/amazon_models.py new file mode 100644 index 0000000000..d0cbddbfe5 --- /dev/null +++ b/mteb/models/amazon_models.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import logging +from functools import partial +from typing import Any + +import numpy as np +import json + +from mteb.model_meta import ModelMeta +from mteb.models.text_formatting_utils import corpus_to_texts +from mteb.requires_package import requires_package + +logger = logging.getLogger(__name__) + + +class AmazonWrapper: + def __init__(self, model_id: str, **kwargs) -> None: + requires_package(self, "boto3", "Amazon Bedrock") + import boto3 + boto3_session = boto3.session.Session() + region_name = boto3_session.region_name + self._client = boto3.client( + "bedrock-runtime", + region_name, + ) + self._model_id = model_id + + def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: + requires_package(self, "boto3", "Amazon Bedrock") + + all_embeddings = [] + + for sentence in sentences: + response = self._client.invoke_model( + body=json.dumps({ + "inputText": sentence + }), + modelId=self._model_id, + accept="application/json", + contentType="application/json" + ) + all_embeddings.append(self._to_numpy(response)) + + return np.array(all_embeddings) + + def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: + return self.encode(queries, **kwargs) + + def encode_corpus( + self, corpus: list[dict[str, str]] | dict[str, list[str]], **kwargs: Any + ) -> np.ndarray: + sentences = corpus_to_texts(corpus) + return self.encode(sentences, **kwargs) + + def _to_numpy(self, embedding_response) -> np.ndarray: + response = json.loads(embedding_response.get("body").read()) + return np.array(response['embedding']) + + +amazon_titan_embed_text_v1 = ModelMeta( + name="amazon-titan-embed-text-v1", + revision="1", + release_date=None, + languages=None, # supported languages not specified + loader=partial(AmazonWrapper, model_id="amazon.titan-embed-text-v1"), + max_tokens=8192, + embed_dim=None, + open_source=False, +) + +amazon_titan_embed_text_v2 = ModelMeta( + name="amazon-titan-embed-text-v2", + revision="1", + release_date=None, + languages=None, # supported languages not specified + loader=partial(AmazonWrapper, model_id="amazon.titan-embed-text-v2:0"), + max_tokens=8192, + embed_dim=None, + open_source=False, +) From 1e950e8296cfc69a491648d2f30ad636d9b1b89a Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Wed, 11 Sep 2024 20:06:59 +0000 Subject: [PATCH 24/49] Add Cohere Bedrock models --- mteb/models/__init__.py | 2 + mteb/models/cohere_bedrock_models.py | 101 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 mteb/models/cohere_bedrock_models.py diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 43874823bd..4534d8c542 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -12,6 +12,7 @@ bge_models, bm25, cohere_models, + cohere_bedrock_models, e5_instruct, e5_models, google_models, @@ -125,6 +126,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe model_modules = [ bge_models, bm25, + cohere_bedrock_models, cohere_models, e5_instruct, e5_models, diff --git a/mteb/models/cohere_bedrock_models.py b/mteb/models/cohere_bedrock_models.py new file mode 100644 index 0000000000..af2b2d48ac --- /dev/null +++ b/mteb/models/cohere_bedrock_models.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import logging +from functools import partial +from typing import Any + +import numpy as np +import json + +import mteb +from mteb.model_meta import ModelMeta +from mteb.models.text_formatting_utils import corpus_to_texts +from mteb.requires_package import requires_package + +logger = logging.getLogger(__name__) + + +class CohereBedrockWrapper: + def __init__(self, model_id: str, **kwargs) -> None: + requires_package(self, "boto3", "Amazon Bedrock") + import boto3 + boto3_session = boto3.session.Session() + region_name = boto3_session.region_name + self._client = boto3.client( + "bedrock-runtime", + region_name, + ) + self._model_id = model_id + + def encode(self, sentences: list[str], + prompt_name: str | None = None, + cohere_task_type: str = "search_document", + **kwargs: Any) -> np.ndarray: + requires_package(self, "boto3", "Amazon Bedrock") + + if prompt_name: + task = mteb.get_task(prompt_name) + task_type = task.metadata.type + if task_type in ["Classification", "MultilabelClassification"]: + cohere_task_type = "classification" + elif task_type == "Clustering": + cohere_task_type = "clustering" + + max_batch_size = 96 + sublists = [ + sentences[i: i + max_batch_size] + for i in range(0, len(sentences), max_batch_size) + ] + + all_embeddings = [] + + for sublist in sublists: + response = self._client.invoke_model( + body=json.dumps({ + "texts": [sent[:2048] for sent in sublist], + "input_type": cohere_task_type}), + modelId=self._model_id, + accept="*/*", + contentType="application/json" + ) + all_embeddings.extend(self._to_numpy(response)) + + return np.array(all_embeddings) + + def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: + return self.encode(queries, **kwargs) + + def encode_corpus( + self, corpus: list[dict[str, str]] | dict[str, list[str]], **kwargs: Any + ) -> np.ndarray: + sentences = corpus_to_texts(corpus) + return self.encode(sentences, **kwargs) + + def _to_numpy(self, embedding_response) -> np.ndarray: + response = json.loads(embedding_response.get("body").read()) + return np.array(response['embeddings']) + + +cohere_embed_english_v3 = ModelMeta( + name="cohere-embed-english-v3", + revision="1", + release_date=None, + languages=None, # supported languages not specified + loader=partial(CohereBedrockWrapper, + model_id="cohere.embed-english-v3"), + max_tokens=512, + embed_dim=None, + open_source=False, +) + +cohere_embed_multilingual_v3 = ModelMeta( + name="cohere-embed-multilingual-v3", + revision="1", + release_date=None, + languages=None, # supported languages not specified + loader=partial(CohereBedrockWrapper, + model_id="cohere.embed-multilingual-v3"), + max_tokens=512, + embed_dim=None, + open_source=False, +) From 2116f8cc3499b07d171e6e4e0c0c9943ff2c1e65 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Wed, 11 Sep 2024 20:36:43 +0000 Subject: [PATCH 25/49] Add two SDS Classification tasks --- chem_eval.py | 2 ++ .../eng/SDSEyeProtectionClassification.py | 31 +++++++++++++++++++ .../eng/SDSGlovesClassification.py | 31 +++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py create mode 100644 mteb/tasks/Classification/eng/SDSGlovesClassification.py diff --git a/chem_eval.py b/chem_eval.py index 3cadda2c8b..46aafe7693 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -71,6 +71,8 @@ def json_parser(data): "text-embedding-3-large": "1", "amazon-titan-embed-text-v1": "1", "amazon-titan-embed-text-v2": "1", + "cohere-embed-english-v3": "1", + "cohere-embed-multilingual-v3": "1" } all_tasks = [ diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py new file mode 100644 index 0000000000..4a3f62b04e --- /dev/null +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SDSEyeProtectionClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SDSEyeProtectionClassification", + description="TBW", + reference="https://www.kaggle.com/datasets/eliseu10/material-safety-data-sheets", + dataset={ + "path": "BASF-We-Create-Chemistry/Small-SDS-Eyes-Protection-Classification", + "revision": "685c818bc3065dd8974d58656a0072449c032754", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py new file mode 100644 index 0000000000..de8a7ef4f9 --- /dev/null +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SDSGlovesClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SDSGlovesClassification", + description="TBW", + reference="https://www.kaggle.com/datasets/eliseu10/material-safety-data-sheets", + dataset={ + "path": "BASF-We-Create-Chemistry/Small-SDS-Gloves-Classification", + "revision": "31aa32eac2ed1a2a97929d6da722a659e7cc2e2d", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=None, + descriptive_stats={} + ) From 0c415f1a325795b99a81e564b6af6fe3c4fcf042 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 12 Sep 2024 00:38:43 +0000 Subject: [PATCH 26/49] Add SDS Classification tasks to classification init and chem_eval --- chem_eval.py | 2 ++ mteb/tasks/Classification/__init__.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/chem_eval.py b/chem_eval.py index 46aafe7693..a6d756b7d2 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -102,6 +102,8 @@ def json_parser(data): "WikipediaMedium2ComputationalVsSpectroscopistsClassification", "WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", "WikipediaMedium5Classification", + "SDSEyeProtectionClassification", + "SDSGlovesClassification", # Retrieval "ChemNQRetrieval", "ChemHotpotQARetrieval", diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 317267446d..61c2f0d247 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -33,6 +33,8 @@ from .eng.NewsClassification import * from .eng.PatentClassification import * from .eng.PoemSentimentClassification import * +from .eng.SDSEyeProtectionClassification import * +from .eng.SDSGlovesClassification import * from .eng.ToxicChatClassification import * from .eng.ToxicConversationsClassification import * from .eng.TweetSentimentExtractionClassification import * From 63b416fe47123e3b436297abf1e9752259d986e8 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 3 Oct 2024 22:55:25 +0000 Subject: [PATCH 27/49] Add a retrieval dataset, update dataset names and revisions --- chem_eval.py | 9 ++--- mteb/tasks/BitextMining/__init__.py | 4 +-- ...Mining1.py => CoconutSMILES2Formula1BM.py} | 12 +++---- ...Mining2.py => CoconutSMILES2Formula2BM.py} | 10 +++--- .../multilingual/PubChemSMILESCanonDescBM.py | 2 +- .../multilingual/PubChemSMILESCanonTitleBM.py | 2 +- .../multilingual/PubChemSMILESISoDescBM.py | 2 +- .../multilingual/PubChemSMILESISoTitleBM.py | 2 +- .../eng/SDSEyeProtectionClassification.py | 4 +-- .../eng/SDSGlovesClassification.py | 4 +-- .../eng/WikipediaEZ10Classification.py | 4 +-- .../eng/WikipediaEZ2Classification.py | 4 +-- .../eng/WikipediaEasy10Classification.py | 4 +-- ...eneExpressionVsMetallurgyClassification.py | 4 +-- ...y2GreenhouseVsEnantiopureClassification.py | 4 +-- ...asy2SolidStateVsColloidalClassification.py | 4 +-- .../WikipediaEasy2SpecialClassification.py | 4 +-- .../eng/WikipediaEasy5Classification.py | 4 +-- ...uminescenceVsLuminescenceClassification.py | 4 +-- ...ionProductsNuclearFissionClassification.py | 4 +-- ...sVsSemiconductorMaterialsClassification.py | 4 +-- ...inescenceVsNeurochemistryClassification.py | 4 +-- ...tationalVsSpectroscopistsClassification.py | 4 +-- ...ChromatographyTitrationpHClassification.py | 4 +-- .../eng/WikipediaMedium5Classification.py | 4 +-- .../eng/WikipediaEasy10Clustering.py | 4 +-- .../eng/WikipediaMedium5Clustering.py | 4 +-- mteb/tasks/PairClassification/__init__.py | 4 +-- ...s2NamePC.py => CoconutSMILES2FormulaPC.py} | 8 ++--- .../eng/PubChemAISentenceParaphrasePC.py | 2 +- .../eng/PubChemSMILESCanonDescPC.py | 2 +- .../eng/PubChemSMILESCanonTitlePC.py | 2 +- .../eng/PubChemSMILESIsoDescPC.py | 2 +- .../eng/PubChemSMILESIsoTitlePC.py | 2 +- .../eng/PubChemSynonymPC.py | 2 +- .../eng/PubChemWikiParagraphsPC.py | 2 +- ...y => WikipediaAIParagraphsParaphrasePC.py} | 8 ++--- mteb/tasks/Retrieval/__init__.py | 1 + mteb/tasks/Retrieval/eng/CoconutRetrieval.py | 36 +++++++++++++++++++ 39 files changed, 114 insertions(+), 76 deletions(-) rename mteb/tasks/BitextMining/multilingual/{CoconutSmiles2NameBitextMining1.py => CoconutSMILES2Formula1BM.py} (81%) rename mteb/tasks/BitextMining/multilingual/{CoconutSmiles2NameBitextMining2.py => CoconutSMILES2Formula2BM.py} (85%) rename mteb/tasks/PairClassification/eng/{CoconutSmiles2NamePC.py => CoconutSMILES2FormulaPC.py} (88%) rename mteb/tasks/PairClassification/eng/{PubChemAIParagraphsParaphrasePC.py => WikipediaAIParagraphsParaphrasePC.py} (85%) create mode 100644 mteb/tasks/Retrieval/eng/CoconutRetrieval.py diff --git a/chem_eval.py b/chem_eval.py index a6d756b7d2..cc609e8e3b 100644 --- a/chem_eval.py +++ b/chem_eval.py @@ -77,8 +77,8 @@ def json_parser(data): all_tasks = [ # Pair Classification - "CoconutSmiles2NamePC", - "PubChemAIParagraphsParaphrasePC", + "CoconutSMILES2FormulaPC", + "WikipediaAIParagraphsParaphrasePC", "PubChemAISentenceParaphrasePC", "PubChemSMILESCanonDescPC", "PubChemSMILESCanonTitlePC", @@ -107,9 +107,10 @@ def json_parser(data): # Retrieval "ChemNQRetrieval", "ChemHotpotQARetrieval", + "CoconutRetrieval" # Bitext Mining - "CoconutSmiles2NameBitextMining1", - "CoconutSmiles2NameBitextMining2", + "CoconutSMILES2Formula1BM", + "CoconutSMILES2Formula2BM", "PubChemSMILESISoTitleBM", "PubChemSMILESCanonTitleBM", "PubChemSMILESISoDescBM", diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 44c975d382..19353067b5 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -5,8 +5,8 @@ from .multilingual.BibleNLPBitextMining import * from .multilingual.BUCCBitextMining import * from .multilingual.BUCCBitextMiningFast import * -from .multilingual.CoconutSmiles2NameBitextMining1 import * -from .multilingual.CoconutSmiles2NameBitextMining2 import * +from .multilingual.CoconutSMILES2Formula1BM import * +from .multilingual.CoconutSMILES2Formula2BM import * from .multilingual.DiaBLaBitextMining import * from .multilingual.FloresBitextMining import * from .multilingual.IN22ConvBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula1BM.py similarity index 81% rename from mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py rename to mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula1BM.py index eb4929e9a1..9063c9347c 100644 --- a/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining1.py +++ b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula1BM.py @@ -7,12 +7,12 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class CoconutSmiles2NameBitextMining1(AbsTaskBitextMining, MultilingualTask): +class CoconutSMILES2Formula1BM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( - name="CoconutSmiles2NameBitextMining1", + name="CoconutSMILES2Formula1BM", dataset={ - "path": "BASF-We-Create-Chemistry/CoconutSmiles2NameBitextMining1", - "revision": "cd1089904b8633a55a3ab3fa379c7fd76c02c722" + "path": "BASF-We-Create-Chemistry/CoconutSMILES2FormulaBM", + "revision": "af0913db3a92d4b16ad679733c281b3237d399a5" }, description="TBW", reference="https://coconut.naturalproducts.net/", @@ -52,8 +52,8 @@ def load_data(self, **kwargs): def dataset_transform(self): def create_columns(row): """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["name"] - row["sentence2"] = row["canonical_smiles"] + row["sentence1"] = row["formula"] + row["sentence2"] = row["smiles"] return row # Convert to standard format diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py similarity index 85% rename from mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py rename to mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py index a4e4ad9ee3..21a785a658 100644 --- a/mteb/tasks/BitextMining/multilingual/CoconutSmiles2NameBitextMining2.py +++ b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py @@ -7,12 +7,12 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class CoconutSmiles2NameBitextMining2(AbsTaskBitextMining, MultilingualTask): +class CoconutSMILES2Formula2BM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( - name="CoconutSmiles2NameBitextMining2", + name="CoconutSMILES2Formula2BM", dataset={ "path": "BASF-We-Create-Chemistry/CoconutSmiles2NameBitextMining2", - "revision": "9613249c6b80f794d6cfd4c732c5889e94d7c96e" + "revision": "5db4895bbfbf84db8e3f876285b5178828f6ebee" }, description="TBW", reference="https://coconut.naturalproducts.net/", @@ -52,8 +52,8 @@ def load_data(self, **kwargs): def dataset_transform(self): def create_columns(row): """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["name"] - row["sentence2"] = row["canonical_smiles"] + row["sentence1"] = row["formula"] + row["sentence2"] = row["smiles"] return row # Convert to standard format diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py index e57d707a55..4e96b2e98c 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py @@ -12,7 +12,7 @@ class PubChemSMILESCanonDescBM(AbsTaskBitextMining, MultilingualTask): name="PubChemSMILESCanonDescBM", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonDescBM", - "revision": "c1b58c3d6c27b71c70a509b18915e841c1c89484" + "revision": "a721de4af2857bf3cc014b92f013a4f573d9cb00" }, description="TBW", reference="https://pubchem.ncbi.nlm.nih.gov/", diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py index 51299ff0d0..540e91dac3 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py @@ -12,7 +12,7 @@ class PubChemSMILESCanonTitleBM(AbsTaskBitextMining, MultilingualTask): name="PubChemSMILESCanonTitleBM", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonTitleBM", - "revision": "4e7fd0815c39f407e6fd08c9b1d5bf481022ef4f" + "revision": "2c7a74635cf41b2ca50d878fa1ff670fc5af3ea1" }, description="TBW", reference="https://pubchem.ncbi.nlm.nih.gov/", diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py index 03a449884d..e9d5056745 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py @@ -12,7 +12,7 @@ class PubChemSMILESISoDescBM(AbsTaskBitextMining, MultilingualTask): name="PubChemSMILESISoDescBM", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoDescBM", - "revision": "a08846b55a60a0361a9ee77a1b2487af4740172d" + "revision": "33a2064662e851ea5e42653b303eb9e0f6878a07" }, description="TBW", reference="https://pubchem.ncbi.nlm.nih.gov/", diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py index da0a15abdb..bf0a83edc9 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py @@ -12,7 +12,7 @@ class PubChemSMILESISoTitleBM(AbsTaskBitextMining, MultilingualTask): name="PubChemSMILESISoTitleBM", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoTitleBM", - "revision": "2a39b190693f847ae14bb2515bd2347fd6f808a6" + "revision": "d60f975694c1841e60a39518b80e157c145f0be1" }, description="TBW", reference="https://pubchem.ncbi.nlm.nih.gov/", diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py index 4a3f62b04e..76ee173f4b 100644 --- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -10,8 +10,8 @@ class SDSEyeProtectionClassification(AbsTaskClassification): description="TBW", reference="https://www.kaggle.com/datasets/eliseu10/material-safety-data-sheets", dataset={ - "path": "BASF-We-Create-Chemistry/Small-SDS-Eyes-Protection-Classification", - "revision": "685c818bc3065dd8974d58656a0072449c032754", + "path": "BASF-We-Create-Chemistry/SmallSDSEyeProtectionClassification", + "revision": "35cbe5ee544dd26e343238a333de4568e6f77819", }, type="Classification", category="s2p", diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py index de8a7ef4f9..ec23e8846c 100644 --- a/mteb/tasks/Classification/eng/SDSGlovesClassification.py +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -10,8 +10,8 @@ class SDSGlovesClassification(AbsTaskClassification): description="TBW", reference="https://www.kaggle.com/datasets/eliseu10/material-safety-data-sheets", dataset={ - "path": "BASF-We-Create-Chemistry/Small-SDS-Gloves-Classification", - "revision": "31aa32eac2ed1a2a97929d6da722a659e7cc2e2d", + "path": "BASF-We-Create-Chemistry/SmallSDSGlovesClassification", + "revision": "c723236c5ec417d79512e6104aca9d2cd88168f6", }, type="Classification", category="s2p", diff --git a/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py b/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py index d63fdb7fed..293dac89c8 100644 --- a/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py +++ b/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py @@ -10,8 +10,8 @@ class WikipediaEZ10Classification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEZ10Class", - "revision": "4dcc49b22904b1c91d8a5eac701b1182ff988bda", + "path": "BASF-We-Create-Chemistry/WikipediaEZ10Classification", + "revision": "8121c72f71ccb570bcfdd9f0f6b52059507db983", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py b/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py index 1ceddaf36d..e908175746 100644 --- a/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py +++ b/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py @@ -10,8 +10,8 @@ class WikipediaEZ2Classification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEZ2Class", - "revision": "03d7df0886b6c450a182f221d11f555a95507417", + "path": "BASF-We-Create-Chemistry/WikipediaEZ2Classification", + "revision": "1e4c5cbdfca9b7dab2d0822eff88a0bf5f79c429", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py b/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py index 023a89de58..5f97f1e4e0 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py @@ -10,8 +10,8 @@ class WikipediaEasy10Classification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_10_Class", - "revision": "c5ae57cbc4acbf63c49f1b4b1408eb4209e5cf74", + "path": "BASF-We-Create-Chemistry/WikipediaEasy10Classification", + "revision": "c1a2d556de003739a787e2853ca9746b4ee1333f", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py index 7f2e7f7508..4708ca342b 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py @@ -10,8 +10,8 @@ class WikipediaEasy2GeneExpressionVsMetallurgyClassification(AbsTaskClassificati description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Gene_Expression_vs_Metallurgy", - "revision": "2a386fa589c865c8bcd6afbf201bc4f871fe9ef6", + "path": "BASF-We-Create-Chemistry/WikipediaEasy2GeneExpressionVsMetallurgyClassification", + "revision": "6ac491e5de9070c6dd434b31e76d3d379123dcff", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py index 80f647b923..088c5164fe 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py @@ -10,8 +10,8 @@ class WikipediaEasy2GreenhouseVsEnantiopureClassification(AbsTaskClassification) description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Greenhouse_vs_Enantiopure", - "revision": "92cddec63a3c8ef29dc72ebaba7204625d864a2b", + "path": "BASF-We-Create-Chemistry/WikipediaEasy2GreenhouseVsEnantiopureClassification", + "revision": "0cfc1a83b6ed832454e8f4f93f7a0e26208274d9", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py index 95facf2936..bc48090be6 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py @@ -10,8 +10,8 @@ class WikipediaEasy2SolidStateVsColloidalClassification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Solid_State_vs_Colloidal", - "revision": "c9d4228c53c402cf3d340d3ccbcdb2cc37c8d6f3", + "path": "BASF-We-Create-Chemistry/WikipediaEasy2SolidStateVsColloidalClassification", + "revision": "7d8df44e588b6143d4856c781f72f919fa0599a7", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py index 28269d283a..edf16cc472 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py @@ -10,8 +10,8 @@ class WikipediaEasy2SpecialClassification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_2_Class_Special", - "revision": "2e748237767f6a8651901493ec5f20d9c125af11", + "path": "BASF-We-Create-Chemistry/WikipediaEasy2SpecialClassification", + "revision": "96d1d9b37c4693f74c46c83d63a290573f78d511", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py b/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py index b34986b72a..c5755b1b11 100644 --- a/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py +++ b/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py @@ -10,8 +10,8 @@ class WikipediaEasy5Classification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Easy_5_Class", - "revision": "14df6a56f71a288622c63a473e81f9205af8e1a7", + "path": "BASF-We-Create-Chemistry/WikipediaEasy5Classification", + "revision": "858633e882dadd1ec6a0d220f7549bcafd379236", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py index b3c7a1a8f6..c70648bf12 100644 --- a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py @@ -10,8 +10,8 @@ class WikipediaHard2BioluminescenceVsLuminescenceClassification(AbsTaskClassific description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Hard_2_Class_Bioluminescence_vs_Luminescence", - "revision": "907895e5fe3138626c1c8d8ff26ac90b3c447cf2", + "path": "BASF-We-Create-Chemistry/WikipediaHard2BioluminescenceVsLuminescenceClassification", + "revision": "21c4dcebe2c5b36a35292e6441e7a10b59bf4896", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py index 029893dc73..d7de5677ef 100644 --- a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py @@ -10,8 +10,8 @@ class WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification(AbsTas description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Hard_2_Class_Isotopes_vs_Fission_Products_Nuclear_Fission", - "revision": "9c3974e039e774828742e739a3cc7bced7b337d5", + "path": "BASF-We-Create-Chemistry/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", + "revision": "897743346c7c794264f7dbfadc3978aa2895e8e2", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py index 0b73298662..0e5d31614a 100644 --- a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py @@ -10,8 +10,8 @@ class WikipediaHard2SaltsVsSemiconductorMaterialsClassification(AbsTaskClassific description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Hard_2_Class_Salts_vs_Semiconductor_Materials", - "revision": "b8f1f0eb9c3f54db47a6a6080938dcfdf307ef9f", + "path": "BASF-We-Create-Chemistry/WikipediaHard2SaltsVsSemiconductorMaterialsClassification", + "revision": "9e5415a096012fa2d1f3a929952cf9859e4550e7", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py index 7aae4f87ff..2e8aa28c4d 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py @@ -10,8 +10,8 @@ class WikipediaMedium2BioluminescenceVsNeurochemistryClassification(AbsTaskClass description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_2_Class_Bioluminescence_vs_Neurochemistry", - "revision": "4b1018f7a60702173d5ff9c08fda4704961ca3be", + "path": "BASF-We-Create-Chemistry/WikipediaMedium2BioluminescenceVsNeurochemistryClassification", + "revision": "2f68b7d34c2be896e46b14533573b366e59e5aae", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py index 635b74c52b..8c9932a467 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py @@ -10,8 +10,8 @@ class WikipediaMedium2ComputationalVsSpectroscopistsClassification(AbsTaskClassi description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_2_Class_Computational_vs_Spectroscopists", - "revision": "e74c1a94e9a0aca888324e89df2b7086a2f0923f", + "path": "BASF-We-Create-Chemistry/WikipediaMedium2ComputationalVsSpectroscopistsClassification", + "revision": "474d706a22b0451b5846d623aa4b4234ba5b0513", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py index f9b3a57d78..c89ab61e6f 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py @@ -10,8 +10,8 @@ class WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification(A description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_2_Class_Crystallography_vs_Chromatography_Titration_pH", - "revision": "f1b8f8ca2afd4e8e988e077ac7f42aeae1e1a51c", + "path": "BASF-We-Create-Chemistry/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", + "revision": "740565a6a853aaed1114a13bdfd5fd46857b4f11", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py b/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py index 19a31f7d8b..b9d00e96c2 100644 --- a/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py +++ b/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py @@ -10,8 +10,8 @@ class WikipediaMedium5Classification(AbsTaskClassification): description="TBW", reference="https://wikipedia.org", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Medium_5_Class", - "revision": "bc4750e59690c46be0ea4cf41ab52d034cea3a06", + "path": "BASF-We-Create-Chemistry/WikipediaMedium5Classification", + "revision": "f81a76a2fb690e5d5bd7a26dd07e85cdf8405dfb", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py b/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py index 1ae5c92788..3425575550 100644 --- a/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py @@ -11,8 +11,8 @@ class WikipediaEasy10Clustering(AbsTaskClustering): description="TBW", reference="https://huggingface.co/datasets/wikipedia", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Clustering_Easy_10_Class", - "revision": "67aa9e201b030038d16241a39795f5b5e5a89898", + "path": "BASF-We-Create-Chemistry/WikipediaEasy10Clustering", + "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", }, type="Clustering", category="p2p", diff --git a/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py b/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py index 6f366ad200..f1f810e1eb 100644 --- a/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py @@ -11,8 +11,8 @@ class WikipediaMedium5Clustering(AbsTaskClustering): description="TBW", reference="https://huggingface.co/datasets/wikipedia", dataset={ - "path": "BASF-We-Create-Chemistry/Wikipedia_Clustering_Medium_5_Class", - "revision": "178f49f21672a31f3fc94ac28e5703eb7c8d3291", + "path": "BASF-We-Create-Chemistry/WikipediaMedium5Clustering", + "revision": "7754d8d296f9f4c3af1c6426fab36304730ccddf", }, type="Clustering", category="p2p", diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 5ba6e47dd8..4cef32c163 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -3,9 +3,8 @@ from .ara.ArEntail import * from .ces.CTKFactsNLI import * from .deu.FalseFriendsDeEnPC import * -from .eng.CoconutSmiles2NamePC import * +from .eng.CoconutSMILES2FormulaPC import * from .eng.LegalBenchPC import * -from .eng.PubChemAIParagraphsParaphrasePC import * from .eng.PubChemAISentenceParaphrasePC import * from .eng.PubChemSMILESCanonDescPC import * from .eng.PubChemSMILESCanonTitlePC import * @@ -16,6 +15,7 @@ from .eng.SprintDuplicateQuestionsPC import * from .eng.TwitterSemEval2015PC import * from .eng.TwitterURLCorpusPC import * +from .eng.WikipediaAIParagraphsParaphrasePC import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePC.py b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py similarity index 88% rename from mteb/tasks/PairClassification/eng/CoconutSmiles2NamePC.py rename to mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py index 3880c42eff..4e84a59445 100644 --- a/mteb/tasks/PairClassification/eng/CoconutSmiles2NamePC.py +++ b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py @@ -16,14 +16,14 @@ } -class CoconutSmiles2NamePC(AbsTaskPairClassification): +class CoconutSMILES2FormulaPC(AbsTaskPairClassification): metadata = TaskMetadata( - name="CoconutSmiles2NamePC", + name="CoconutSMILES2FormulaPC", description="""TBW""", reference="https://coconut.naturalproducts.net/", dataset={ - "path": "BASF-We-Create-Chemistry/CoconutSmiles2NamePairClassification", - "revision": "6e7c8b5419a8b437fc1217bb35101a198e723db6" + "path": "BASF-We-Create-Chemistry/CoconutSMILES2FormulaPC", + "revision": "76bb5eaa5cd86e795dfeb734692dca619d2aed74" }, type="PairClassification", category="p2p", diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py index cbb763bb01..167389f3d8 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -16,7 +16,7 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemAISentenceParaphrasePC", - "revision": "eeaad4bb9ec83058589faec127cdcb38fc7bfb2e" + "revision": "f33a205966ce032f957c3a22f4f9e378f89a2c56" }, type="PairClassification", category="s2s", diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py index 417aca9fef..9e6eff57d4 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py @@ -23,7 +23,7 @@ class PubChemSMILESCanonDescPC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonDescPC", - "revision": "ed91e93abf734d82ad1a0abf4d7653521173e2fb" + "revision": "6236cc0c3003bea6034d00f96d2202f7c05629c6" }, type="PairClassification", category="s2s", diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py index cb227dc9d7..173f140976 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py @@ -23,7 +23,7 @@ class PubChemSMILESCanonTitlePC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonTitlePC", - "revision": "e0df8b7d5a9184cd8fc981959e6a81228f28c3a1" + "revision": "3cce5bbb9ffe0d63a74102f2f5037aea47244c8f" }, type="PairClassification", category="s2s", diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py index c93954fd4c..661c825b60 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py @@ -23,7 +23,7 @@ class PubChemSMILESIsoDescPC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoDescPC", - "revision": "a6c8bb2cb2ced89bf4576744c85b66b3319aa330" + "revision": "afedff80aa393a8bdc5e05da46252dc5fde99029" }, type="PairClassification", category="s2s", diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py index cde72117d1..ccc0b699e1 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py @@ -23,7 +23,7 @@ class PubChemSMILESIsoTitlePC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoTitlePC", - "revision": "d8865a1b3a269aa2f4b4059a13c1037a9adfbc5d" + "revision": "1b0d57516ec7c168b8da44f80148b5418ba394b3" }, type="PairClassification", category="s2s", diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py index 3ce1fa2cf2..9bec11eede 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -23,7 +23,7 @@ class PubChemSynonymPC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemSynonymPC", - "revision": "dcd82d56e7a8388db23aa962c99d59fe9a2ba7e3" + "revision": "5037d69d177c9628fb79cb57eea1299178b28c1b" }, type="PairClassification", category="s2s", diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py index 062ac3ed9c..9b6f3ee684 100644 --- a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -16,7 +16,7 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ "path": "BASF-We-Create-Chemistry/PubChemWikiParagraphsPC", - "revision": "3a12c34da3bdfeaca47058b624e8223513c7ae46" + "revision": "36c2aabe8e5bdb034701d0a226cac3c27d09575c" }, type="PairClassification", category="p2p", diff --git a/mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py similarity index 85% rename from mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py rename to mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py index fe491be68d..56532b86a5 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAIParagraphsParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py @@ -9,14 +9,14 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class PubChemAIParagraphsParaphrasePC(AbsTaskPairClassification): +class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): metadata = TaskMetadata( - name="PubChemAIParagraphsParaphrasePC", + name="WikipediaAIParagraphsParaphrasePC", description="""TBW""", reference="https://pubchem.ncbi.nlm.nih.gov/", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemAIParagraphsParaphrasePC", - "revision": "bc3efec1bde242c3cdc3b7870c094f4e5a935fee" + "path": "BASF-We-Create-Chemistry/WikipediaAIParagraphsParaphrasePC", + "revision": "a430437ea6c6fe0e6461e6d6659f647d0bf62496" }, type="PairClassification", category="p2p", diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 520721e6d0..50c230494d 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -32,6 +32,7 @@ from .eng.ChemNQRetrieval import * from .eng.ChemHotpotQARetrieval import * from .eng.ClimateFEVERRetrieval import * +from .eng.CoconutRetrieval import * from .eng.CQADupstackAndroidRetrieval import * from .eng.CQADupstackEnglishRetrieval import * from .eng.CQADupstackGamingRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py new file mode 100644 index 0000000000..d3b09f723e --- /dev/null +++ b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py @@ -0,0 +1,36 @@ +from __future__ import annotations +import logging + +from mteb.abstasks.TaskMetadata import TaskMetadata + + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + +logger = logging.getLogger(__name__) + + +class CoconutRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CoconutRetrieval", + dataset={ + "path": "BASF-We-Create-Chemistry/CoconutRetrieval", + "revision": "fdb30de349565a819d481f1eb7ef6f851ff150fc", + }, + description="COCONUT: the COlleCtion of Open NatUral producTs", + reference="https://coconut.naturalproducts.net/", + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation="""""", + descriptive_stats={} + ) From f1a36e726957efffca2dededede132f1f9bc28c3 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 3 Oct 2024 23:26:37 +0000 Subject: [PATCH 28/49] Update revision for the CoconutRetrieval dataset: handle duplicate SMILES (documents) --- mteb/tasks/Retrieval/eng/CoconutRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py index d3b09f723e..591f3e82d3 100644 --- a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py @@ -14,7 +14,7 @@ class CoconutRetrieval(AbsTaskRetrieval): name="CoconutRetrieval", dataset={ "path": "BASF-We-Create-Chemistry/CoconutRetrieval", - "revision": "fdb30de349565a819d481f1eb7ef6f851ff150fc", + "revision": "4c23111a06ff9162dc8521dfe8096c544ab9548b", }, description="COCONUT: the COlleCtion of Open NatUral producTs", reference="https://coconut.naturalproducts.net/", From 16ce7b9ca82d6c482254580832965e6bd974b139 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Fri, 4 Oct 2024 16:03:23 +0000 Subject: [PATCH 29/49] Update `CoconutSMILES2FormulaPC` task --- .../tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py index 4e84a59445..b6f190db37 100644 --- a/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py +++ b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py @@ -10,8 +10,8 @@ _DATASET_COLUMN_MAP = { - "sentence1": "name", - "sentence2": "canonical_smiles", + "sentence1": "formula", + "sentence2": "smiles", "labels": "label", } @@ -23,7 +23,7 @@ class CoconutSMILES2FormulaPC(AbsTaskPairClassification): reference="https://coconut.naturalproducts.net/", dataset={ "path": "BASF-We-Create-Chemistry/CoconutSMILES2FormulaPC", - "revision": "76bb5eaa5cd86e795dfeb734692dca619d2aed74" + "revision": "e46d4868e417703bdcf32aadbe5d0e05a1b7f085" }, type="PairClassification", category="p2p", From 7c10fa9ebbaf0f562118dbbcd1fd70047eeec1be Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 10 Oct 2024 17:15:08 +0000 Subject: [PATCH 30/49] Change CoconutRetrieval dataset to a smaller one --- mteb/tasks/Retrieval/eng/CoconutRetrieval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py index 591f3e82d3..6382955ea7 100644 --- a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py @@ -13,8 +13,8 @@ class CoconutRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="CoconutRetrieval", dataset={ - "path": "BASF-We-Create-Chemistry/CoconutRetrieval", - "revision": "4c23111a06ff9162dc8521dfe8096c544ab9548b", + "path": "BASF-We-Create-Chemistry/SmallCoconutRetrieval", + "revision": "831d292c3959eae59e4f89b8758738feee97d6cf", }, description="COCONUT: the COlleCtion of Open NatUral producTs", reference="https://coconut.naturalproducts.net/", From 4e9258908c3a529513f67e782537e1848e096767 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 2 Jan 2025 21:34:15 +0000 Subject: [PATCH 31/49] Update some models - Integrate models added in ChemTEB (such as amazon, cohere bedrock and nomic bert) with latest modeling format in mteb. - Update the metadata for the mentioned models --- mteb/models/amazon_models.py | 43 ++--- mteb/models/cohere_bedrock_models.py | 237 ++++++++++++++++++++++----- mteb/models/nomic_bert_models.py | 11 +- 3 files changed, 229 insertions(+), 62 deletions(-) diff --git a/mteb/models/amazon_models.py b/mteb/models/amazon_models.py index d0cbddbfe5..c736ebd796 100644 --- a/mteb/models/amazon_models.py +++ b/mteb/models/amazon_models.py @@ -8,13 +8,13 @@ import json from mteb.model_meta import ModelMeta -from mteb.models.text_formatting_utils import corpus_to_texts from mteb.requires_package import requires_package +from .wrapper import Wrapper logger = logging.getLogger(__name__) -class AmazonWrapper: +class AmazonWrapper(Wrapper): def __init__(self, model_id: str, **kwargs) -> None: requires_package(self, "boto3", "Amazon Bedrock") import boto3 @@ -44,38 +44,43 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: return np.array(all_embeddings) - def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: - return self.encode(queries, **kwargs) - - def encode_corpus( - self, corpus: list[dict[str, str]] | dict[str, list[str]], **kwargs: Any - ) -> np.ndarray: - sentences = corpus_to_texts(corpus) - return self.encode(sentences, **kwargs) - def _to_numpy(self, embedding_response) -> np.ndarray: response = json.loads(embedding_response.get("body").read()) return np.array(response['embedding']) amazon_titan_embed_text_v1 = ModelMeta( - name="amazon-titan-embed-text-v1", + name="amazon/titan-embed-text-v1", revision="1", - release_date=None, + release_date="2023-09-27", languages=None, # supported languages not specified loader=partial(AmazonWrapper, model_id="amazon.titan-embed-text-v1"), max_tokens=8192, - embed_dim=None, - open_source=False, + embed_dim=1536, + open_weights=False, + n_parameters=None, + memory_usage=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, ) amazon_titan_embed_text_v2 = ModelMeta( - name="amazon-titan-embed-text-v2", + name="amazon/titan-embed-text-v2", revision="1", - release_date=None, + release_date="2024-04-30", languages=None, # supported languages not specified loader=partial(AmazonWrapper, model_id="amazon.titan-embed-text-v2:0"), max_tokens=8192, - embed_dim=None, - open_source=False, + embed_dim=1024, + open_weights=False, + n_parameters=None, + memory_usage=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2024/04/amazon-titan-text-embeddings-v2-amazon-bedrock/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, ) diff --git a/mteb/models/cohere_bedrock_models.py b/mteb/models/cohere_bedrock_models.py index af2b2d48ac..a3096c8eb3 100644 --- a/mteb/models/cohere_bedrock_models.py +++ b/mteb/models/cohere_bedrock_models.py @@ -6,17 +6,136 @@ import numpy as np import json +import tqdm -import mteb from mteb.model_meta import ModelMeta -from mteb.models.text_formatting_utils import corpus_to_texts +from mteb.encoder_interface import PromptType from mteb.requires_package import requires_package +from .wrapper import Wrapper logger = logging.getLogger(__name__) -class CohereBedrockWrapper: - def __init__(self, model_id: str, **kwargs) -> None: +supported_languages = [ + "afr-Latn", + "amh-Ethi", + "ara-Arab", + "asm-Beng", + "aze-Latn", + "bel-Cyrl", + "bul-Cyrl", + "ben-Beng", + "bod-Tibt", + "bos-Latn", + "cat-Latn", + "ceb-Latn", + "cos-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "eng-Latn", + "epo-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "fry-Latn", + "gle-Latn", + "gla-Latn", + "glg-Latn", + "guj-Gujr", + "hau-Latn", + "haw-Latn", + "heb-Hebr", + "hin-Deva", + "hmn-Latn", + "hrv-Latn", + "hat-Latn", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "ibo-Latn", + "isl-Latn", + "ita-Latn", + "jpn-Jpan", + "jav-Latn", + "kat-Geor", + "kaz-Cyrl", + "khm-Khmr", + "kan-Knda", + "kor-Kore", + "kur-Arab", + "kir-Cyrl", + "lat-Latn", + "ltz-Latn", + "lao-Laoo", + "lit-Latn", + "lav-Latn", + "mlg-Latn", + "mri-Latn", + "mkd-Cyrl", + "mal-Mlym", + "mon-Cyrl", + "mar-Deva", + "msa-Latn", + "mlt-Latn", + "mya-Mymr", + "nep-Deva", + "nld-Latn", + "nor-Latn", + "nya-Latn", + "ori-Orya", + "pan-Guru", + "pol-Latn", + "por-Latn", + "ron-Latn", + "rus-Cyrl", + "kin-Latn", + "sin-Sinh", + "slk-Latn", + "slv-Latn", + "smo-Latn", + "sna-Latn", + "som-Latn", + "sqi-Latn", + "srp-Cyrl", + "sot-Latn", + "sun-Latn", + "swe-Latn", + "swa-Latn", + "tam-Taml", + "tel-Telu", + "tgk-Cyrl", + "tha-Thai", + "tuk-Latn", + "tgl-Latn", + "tur-Latn", + "tat-Cyrl", + "uig-Arab", + "ukr-Cyrl", + "urd-Arab", + "uzb-Latn", + "vie-Latn", + "wol-Latn", + "xho-Latn", + "yid-Hebr", + "yor-Latn", + "zho-Hans", + "zul-Latn", +] + + +class CohereBedrockWrapper(Wrapper): + def __init__( + self, + model_id: str, + model_prompts: dict[str, str] | None = None, + **kwargs + ) -> None: requires_package(self, "boto3", "Amazon Bedrock") import boto3 boto3_session = boto3.session.Session() @@ -26,33 +145,29 @@ def __init__(self, model_id: str, **kwargs) -> None: region_name, ) self._model_id = model_id + self.model_prompts = ( + self.validate_task_to_prompt_name(model_prompts) if model_prompts else None + ) - def encode(self, sentences: list[str], - prompt_name: str | None = None, - cohere_task_type: str = "search_document", - **kwargs: Any) -> np.ndarray: - requires_package(self, "boto3", "Amazon Bedrock") - - if prompt_name: - task = mteb.get_task(prompt_name) - task_type = task.metadata.type - if task_type in ["Classification", "MultilabelClassification"]: - cohere_task_type = "classification" - elif task_type == "Clustering": - cohere_task_type = "clustering" - + def _embed( + self, + sentences: list[str], + cohere_task_type: str, + show_progress_bar: bool = False, + ) -> np.ndarray: max_batch_size = 96 - sublists = [ + + batches = [ sentences[i: i + max_batch_size] for i in range(0, len(sentences), max_batch_size) ] all_embeddings = [] - for sublist in sublists: + for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): response = self._client.invoke_model( body=json.dumps({ - "texts": [sent[:2048] for sent in sublist], + "texts": [sent[:2048] for sent in batch], "input_type": cohere_task_type}), modelId=self._model_id, accept="*/*", @@ -62,40 +177,80 @@ def encode(self, sentences: list[str], return np.array(all_embeddings) - def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: - return self.encode(queries, **kwargs) - - def encode_corpus( - self, corpus: list[dict[str, str]] | dict[str, list[str]], **kwargs: Any + def encode( + self, + sentences: list[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs: Any, ) -> np.ndarray: - sentences = corpus_to_texts(corpus) - return self.encode(sentences, **kwargs) + prompt_name = self.get_prompt_name(self.model_prompts, task_name, prompt_type) + cohere_task_type = self.model_prompts.get(prompt_name) + + if cohere_task_type is None: + # search_document is recommended if unknown (https://cohere.com/blog/introducing-embed-v3) + cohere_task_type = "search_document" + + show_progress_bar = ( + False + if "show_progress_bar" not in kwargs + else kwargs.pop("show_progress_bar") + ) + + return self._embed( + sentences, + cohere_task_type=cohere_task_type, + show_progress_bar=show_progress_bar, + ) def _to_numpy(self, embedding_response) -> np.ndarray: response = json.loads(embedding_response.get("body").read()) return np.array(response['embeddings']) +model_prompts = { + "Classification": "classification", + "MultilabelClassification": "classification", + "Clustering": "clustering", + PromptType.query.value: "search_query", + PromptType.passage.value: "search_document", +} + cohere_embed_english_v3 = ModelMeta( - name="cohere-embed-english-v3", - revision="1", - release_date=None, - languages=None, # supported languages not specified loader=partial(CohereBedrockWrapper, - model_id="cohere.embed-english-v3"), + model_id="cohere.embed-english-v3", model_prompts=model_prompts), + name="bedrock/cohere-embed-english-v3", + languages=["eng-Latn"], + open_weights=False, + reference="https://cohere.com/blog/introducing-embed-v3", + revision="1", + release_date="2023-11-02", + n_parameters=None, + memory_usage=None, max_tokens=512, - embed_dim=None, - open_source=False, + embed_dim=1024, + license=None, + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, ) cohere_embed_multilingual_v3 = ModelMeta( + loader=partial(CohereBedrockWrapper, + model_id="cohere.embed-multilingual-v3", model_prompts=model_prompts), name="cohere-embed-multilingual-v3", + languages=supported_languages, + open_weights=False, + reference="https://cohere.com/blog/introducing-embed-v3", revision="1", - release_date=None, - languages=None, # supported languages not specified - loader=partial(CohereBedrockWrapper, - model_id="cohere.embed-multilingual-v3"), + release_date="2023-11-02", + n_parameters=None, + memory_usage=None, max_tokens=512, - embed_dim=None, - open_source=False, + embed_dim=1024, + license=None, + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, ) diff --git a/mteb/models/nomic_bert_models.py b/mteb/models/nomic_bert_models.py index 281a84f9a5..7f8d2f4bab 100644 --- a/mteb/models/nomic_bert_models.py +++ b/mteb/models/nomic_bert_models.py @@ -142,8 +142,15 @@ def custom_nomic_bert_loader( ), name="nomic-ai/nomic-bert-2048", languages=["eng-Latn"], - open_source=True, - revision=None, + open_weights=True, + revision="40b98394640e630d5276807046089b233113aa87", release_date="2024-01-03", # first commit + open_weights=True, + license="apache-2.0", + framework=["Sentence Transformers", "PyTorch"], + reference="https://huggingface.co/nomic-ai/nomic-bert-2048", + public_training_data=True, + public_training_code=True, + max_tokens=2048, ) From cea106315783529b2c0b0dfc4e29a006cd8c3a50 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 2 Jan 2025 22:19:56 +0000 Subject: [PATCH 32/49] Fix a typo `open_weights` argument is repeated twice --- mteb/models/nomic_bert_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/models/nomic_bert_models.py b/mteb/models/nomic_bert_models.py index 7f8d2f4bab..10da788d7c 100644 --- a/mteb/models/nomic_bert_models.py +++ b/mteb/models/nomic_bert_models.py @@ -145,7 +145,6 @@ def custom_nomic_bert_loader( open_weights=True, revision="40b98394640e630d5276807046089b233113aa87", release_date="2024-01-03", # first commit - open_weights=True, license="apache-2.0", framework=["Sentence Transformers", "PyTorch"], reference="https://huggingface.co/nomic-ai/nomic-bert-2048", From 47668acefffef24ad36eab609c31e93ad0e2fa84 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sat, 4 Jan 2025 22:18:56 +0000 Subject: [PATCH 33/49] Update ChemTEB tasks - Rename some tasks for better readability. - Merge some BitextMining and PairClassification tasks into a single task with subsets (`PubChemSMILESBitextMining` and `PubChemSMILESPC`) - Add a new multilingual task (`PubChemWikiPairClassification`) consisting of 12 languages. - Update dataset paths, revisions and metadata for most tasks. - Add a `Chemistry` domain to `TaskMetadata` --- mteb/abstasks/TaskMetadata.py | 1 + mteb/tasks/BitextMining/__init__.py | 4 +- .../multilingual/CoconutSMILES2Formula2BM.py | 61 ---------- ...rmula1BM.py => CoconutSMILES2FormulaBM.py} | 38 +++--- .../multilingual/PubChemSMILESBitextMining.py | 71 +++++++++++ .../multilingual/PubChemSMILESCanonDescBM.py | 34 +++--- .../multilingual/PubChemSMILESCanonTitleBM.py | 34 +++--- .../multilingual/PubChemSMILESISoDescBM.py | 34 +++--- .../multilingual/PubChemSMILESISoTitleBM.py | 34 +++--- mteb/tasks/Classification/__init__.py | 30 ++--- .../eng/SDSEyeProtectionClassification.py | 22 ++-- .../eng/SDSGlovesClassification.py | 22 ++-- .../eng/WikipediaBioMetChemClassification.py | 37 ++++++ .../WikipediaBiolumNeurochemClassification.py | 37 ++++++ ...kipediaChemEngSpecialtiesClassification.py | 37 ++++++ .../eng/WikipediaChemFieldsClassification.py | 37 ++++++ .../WikipediaChemistryTopicsClassification.py | 37 ++++++ ...pediaCompChemSpectroscopyClassification.py | 37 ++++++ ...ediaCryobiologySeparationClassification.py | 37 ++++++ ...CrystallographyAnalyticalClassification.py | 37 ++++++ .../eng/WikipediaEZ10Classification.py | 31 ----- .../eng/WikipediaEZ2Classification.py | 31 ----- .../eng/WikipediaEasy10Classification.py | 31 ----- ...eneExpressionVsMetallurgyClassification.py | 31 ----- ...y2GreenhouseVsEnantiopureClassification.py | 31 ----- ...asy2SolidStateVsColloidalClassification.py | 31 ----- .../WikipediaEasy2SpecialClassification.py | 31 ----- .../eng/WikipediaEasy5Classification.py | 31 ----- ...ediaGreenhouseEnantiopureClassification.py | 37 ++++++ ...uminescenceVsLuminescenceClassification.py | 31 ----- ...ionProductsNuclearFissionClassification.py | 31 ----- ...sVsSemiconductorMaterialsClassification.py | 31 ----- .../WikipediaIsotopesFissionClassification.py | 37 ++++++ .../WikipediaLuminescenceClassification.py | 37 ++++++ ...inescenceVsNeurochemistryClassification.py | 31 ----- ...tationalVsSpectroscopistsClassification.py | 31 ----- ...ChromatographyTitrationpHClassification.py | 31 ----- .../eng/WikipediaMedium5Classification.py | 31 ----- ...WikipediaOrganicInorganicClassification.py | 37 ++++++ ...ipediaSaltsSemiconductorsClassification.py | 37 ++++++ ...ipediaSolidStateColloidalClassification.py | 37 ++++++ ...kipediaTheoreticalAppliedClassification.py | 37 ++++++ mteb/tasks/Clustering/__init__.py | 4 +- ...WikipediaChemistrySpecialtiesClustering.py | 38 ++++++ .../eng/WikipediaChemistryTopicsClustering.py | 38 ++++++ .../eng/WikipediaEasy10Clustering.py | 32 ----- .../eng/WikipediaMedium5Clustering.py | 32 ----- mteb/tasks/PairClassification/__init__.py | 2 + .../eng/CoconutSMILES2FormulaPC.py | 44 +++---- .../eng/PubChemAISentenceParaphrasePC.py | 29 +++-- .../eng/PubChemSMILESCanonDescPC.py | 44 +++---- .../eng/PubChemSMILESCanonTitlePC.py | 45 +++---- .../eng/PubChemSMILESIsoDescPC.py | 44 +++---- .../eng/PubChemSMILESIsoTitlePC.py | 45 +++---- .../PairClassification/eng/PubChemSMILESPC.py | 115 ++++++++++++++++++ .../eng/PubChemSynonymPC.py | 44 +++---- .../eng/PubChemWikiParagraphsPC.py | 31 ++--- .../eng/WikipediaAIParagraphsParaphrasePC.py | 29 +++-- .../PubChemWikiPairClassification.py | 67 ++++++++++ .../Retrieval/eng/ChemHotpotQARetrieval.py | 52 +++----- mteb/tasks/Retrieval/eng/ChemNQRetrieval.py | 37 +++--- 61 files changed, 1225 insertions(+), 952 deletions(-) delete mode 100644 mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py rename mteb/tasks/BitextMining/multilingual/{CoconutSMILES2Formula1BM.py => CoconutSMILES2FormulaBM.py} (52%) create mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py create mode 100644 mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEZ10Classification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEZ2Classification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEasy10Classification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaEasy5Classification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py delete mode 100644 mteb/tasks/Classification/eng/WikipediaMedium5Classification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py create mode 100644 mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py create mode 100644 mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py delete mode 100644 mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py delete mode 100644 mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESPC.py create mode 100644 mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 07c4f97a04..b04753d877 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -70,6 +70,7 @@ "Web", "Written", "Programming", + "Chemistry", ] SAMPLE_CREATION_METHOD = Literal[ diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 19353067b5..0d991d1358 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -5,8 +5,7 @@ from .multilingual.BibleNLPBitextMining import * from .multilingual.BUCCBitextMining import * from .multilingual.BUCCBitextMiningFast import * -from .multilingual.CoconutSMILES2Formula1BM import * -from .multilingual.CoconutSMILES2Formula2BM import * +from .multilingual.CoconutSMILES2FormulaBM import * from .multilingual.DiaBLaBitextMining import * from .multilingual.FloresBitextMining import * from .multilingual.IN22ConvBitextMining import * @@ -20,6 +19,7 @@ from .multilingual.NusaTranslationBitextMining import * from .multilingual.NusaXBitextMining import * from .multilingual.PhincBitextMining import * +from .multilingual.PubChemSMILESBitextMining import * from .multilingual.PubChemSMILESCanonDescBM import * from .multilingual.PubChemSMILESCanonTitleBM import * from .multilingual.PubChemSMILESISoDescBM import * diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py deleted file mode 100644 index 21a785a658..0000000000 --- a/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula2BM.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from mteb.abstasks.MultilingualTask import MultilingualTask -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class CoconutSMILES2Formula2BM(AbsTaskBitextMining, MultilingualTask): - metadata = TaskMetadata( - name="CoconutSMILES2Formula2BM", - dataset={ - "path": "BASF-We-Create-Chemistry/CoconutSmiles2NameBitextMining2", - "revision": "5db4895bbfbf84db8e3f876285b5178828f6ebee" - }, - description="TBW", - reference="https://coconut.naturalproducts.net/", - type="BitextMining", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs={ - "en-en": ["en-Latn", "eng-Latn"] - }, - main_score="f1", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": {}, "avg_character_length": {}}, - ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - **self.metadata_dict["dataset"]) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - def create_columns(row): - """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["formula"] - row["sentence2"] = row["smiles"] - return row - - # Convert to standard format - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].map(create_columns) diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula1BM.py b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py similarity index 52% rename from mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula1BM.py rename to mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py index 9063c9347c..fb7fc15ee7 100644 --- a/mteb/tasks/BitextMining/multilingual/CoconutSMILES2Formula1BM.py +++ b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py @@ -7,15 +7,15 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class CoconutSMILES2Formula1BM(AbsTaskBitextMining, MultilingualTask): +class CoconutSMILES2FormulaBM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( - name="CoconutSMILES2Formula1BM", + name="CoconutSMILES2FormulaBM", dataset={ - "path": "BASF-We-Create-Chemistry/CoconutSMILES2FormulaBM", + "path": "BASF-AI/CoconutSMILES2FormulaBM", "revision": "af0913db3a92d4b16ad679733c281b3237d399a5" }, - description="TBW", - reference="https://coconut.naturalproducts.net/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="BitextMining", category="s2s", modalities=["text"], @@ -25,14 +25,20 @@ class CoconutSMILES2Formula1BM(AbsTaskBitextMining, MultilingualTask): }, main_score="f1", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) def load_data(self, **kwargs): @@ -50,12 +56,8 @@ def load_data(self, **kwargs): self.data_loaded = True def dataset_transform(self): - def create_columns(row): - """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["formula"] - row["sentence2"] = row["smiles"] - return row - - # Convert to standard format for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].map(create_columns) + self.dataset[lang] = self.dataset[lang].rename_columns({ + "formula": "sentence1", + "smiles": "sentence2" + }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py new file mode 100644 index 0000000000..261c89f3b2 --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +COL_MAPPING = { + "iso-title": {"title": "sentence1", "isomeric_smiles": "sentence2"}, + "iso-desc": {"description": "sentence1", "isomeric_smiles": "sentence2"}, + "canon-title": {"title": "sentence1", "canonical_smiles": "sentence2"}, + "canon-desc": {"description": "sentence1", "canonical_smiles": "sentence2"}, +} + +EVAL_LANGS = { + "iso-title": ["en-Latn", "eng-Latn"], + "iso-desc": ["en-Latn", "eng-Latn"], + "canon-title": ["en-Latn", "eng-Latn"], + "canon-desc": ["en-Latn", "eng-Latn"], +} + + +class PubChemSMILESBitextMining(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="PubChemSMILESBitextMining", + dataset={ + "path": "BASF-AI/PubChemSMILESBitextMining", + "revision": "36700ea628118312ebf2f90ad2353a9a8f188dc9" + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=EVAL_LANGS, + main_score="f1", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub and convert it to the standard format.""" + if self.data_loaded: + return + self.dataset = {} + + for subset in self.hf_subsets: + self.dataset[subset] = datasets.load_dataset( + **self.metadata_dict["dataset"], name=subset) + + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + for subset in self.hf_subsets: + self.dataset[subset] = self.dataset[subset].rename_columns(COL_MAPPING[subset]) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py index 4e96b2e98c..127d9295dd 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py @@ -11,11 +11,11 @@ class PubChemSMILESCanonDescBM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( name="PubChemSMILESCanonDescBM", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonDescBM", + "path": "BASF-AI/PubChemSMILESCanonDescBM", "revision": "a721de4af2857bf3cc014b92f013a4f573d9cb00" }, - description="TBW", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="BitextMining", category="s2p", modalities=["text"], @@ -25,14 +25,20 @@ class PubChemSMILESCanonDescBM(AbsTaskBitextMining, MultilingualTask): }, main_score="f1", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) def load_data(self, **kwargs): @@ -50,12 +56,8 @@ def load_data(self, **kwargs): self.data_loaded = True def dataset_transform(self): - def create_columns(row): - """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["description"] - row["sentence2"] = row["canonical_smiles"] - return row - - # Convert to standard format for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].map(create_columns) + self.dataset[lang] = self.dataset[lang].rename_columns({ + "description": "sentence1", + "canonical_smiles": "sentence2" + }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py index 540e91dac3..29142fc117 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py @@ -11,11 +11,11 @@ class PubChemSMILESCanonTitleBM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( name="PubChemSMILESCanonTitleBM", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonTitleBM", + "path": "BASF-AI/PubChemSMILESCanonTitleBM", "revision": "2c7a74635cf41b2ca50d878fa1ff670fc5af3ea1" }, - description="TBW", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="BitextMining", category="s2s", modalities=["text"], @@ -25,14 +25,20 @@ class PubChemSMILESCanonTitleBM(AbsTaskBitextMining, MultilingualTask): }, main_score="f1", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) def load_data(self, **kwargs): @@ -50,12 +56,8 @@ def load_data(self, **kwargs): self.data_loaded = True def dataset_transform(self): - def create_columns(row): - """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["title"] - row["sentence2"] = row["canonical_smiles"] - return row - - # Convert to standard format for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].map(create_columns) + self.dataset[lang] = self.dataset[lang].rename_columns({ + "title": "sentence1", + "canonical_smiles": "sentence2" + }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py index e9d5056745..b135ea68d4 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py @@ -11,11 +11,11 @@ class PubChemSMILESISoDescBM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( name="PubChemSMILESISoDescBM", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoDescBM", + "path": "BASF-AI/PubChemSMILESIsoDescBM", "revision": "33a2064662e851ea5e42653b303eb9e0f6878a07" }, - description="TBW", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="BitextMining", category="s2p", modalities=["text"], @@ -25,14 +25,20 @@ class PubChemSMILESISoDescBM(AbsTaskBitextMining, MultilingualTask): }, main_score="f1", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) def load_data(self, **kwargs): @@ -50,12 +56,8 @@ def load_data(self, **kwargs): self.data_loaded = True def dataset_transform(self): - def create_columns(row): - """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["description"] - row["sentence2"] = row["isomeric_smiles"] - return row - - # Convert to standard format for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].map(create_columns) + self.dataset[lang] = self.dataset[lang].rename_columns({ + "description": "sentence1", + "isomeric_smiles": "sentence2" + }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py index bf0a83edc9..d0a5361aeb 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py +++ b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py @@ -11,11 +11,11 @@ class PubChemSMILESISoTitleBM(AbsTaskBitextMining, MultilingualTask): metadata = TaskMetadata( name="PubChemSMILESISoTitleBM", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoTitleBM", + "path": "BASF-AI/PubChemSMILESIsoTitleBM", "revision": "d60f975694c1841e60a39518b80e157c145f0be1" }, - description="TBW", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="BitextMining", category="s2s", modalities=["text"], @@ -25,14 +25,20 @@ class PubChemSMILESISoTitleBM(AbsTaskBitextMining, MultilingualTask): }, main_score="f1", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": {}, "avg_character_length": {}}, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) def load_data(self, **kwargs): @@ -50,12 +56,8 @@ def load_data(self, **kwargs): self.data_loaded = True def dataset_transform(self): - def create_columns(row): - """Put all English titles in column 'sentence1' and SMILES strings in 'sentence2' column""" - row["sentence1"] = row["title"] - row["sentence2"] = row["isomeric_smiles"] - return row - - # Convert to standard format for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].map(create_columns) + self.dataset[lang] = self.dataset[lang].rename_columns({ + "title": "sentence1", + "isomeric_smiles": "sentence2" + }) diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 8ee516aae2..09e8761a17 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -39,21 +39,21 @@ from .eng.ToxicConversationsClassification import * from .eng.TweetSentimentExtractionClassification import * from .eng.TweetTopicSingleClassification import * -from .eng.WikipediaEasy2GeneExpressionVsMetallurgyClassification import * -from .eng.WikipediaEasy2GreenhouseVsEnantiopureClassification import * -from .eng.WikipediaEasy2SolidStateVsColloidalClassification import * -from .eng.WikipediaEasy2SpecialClassification import * -from .eng.WikipediaEasy5Classification import * -from .eng.WikipediaEasy10Classification import * -from .eng.WikipediaEZ2Classification import * -from .eng.WikipediaEZ10Classification import * -from .eng.WikipediaHard2BioluminescenceVsLuminescenceClassification import * -from .eng.WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification import * -from .eng.WikipediaHard2SaltsVsSemiconductorMaterialsClassification import * -from .eng.WikipediaMedium2BioluminescenceVsNeurochemistryClassification import * -from .eng.WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification import * -from .eng.WikipediaMedium2ComputationalVsSpectroscopistsClassification import * -from .eng.WikipediaMedium5Classification import * +from .eng.WikipediaBioMetChemClassification import * +from .eng.WikipediaGreenhouseEnantiopureClassification import * +from .eng.WikipediaSolidStateColloidalClassification import * +from .eng.WikipediaOrganicInorganicClassification import * +from .eng.WikipediaCryobiologySeparationClassification import * +from .eng.WikipediaChemistryTopicsClassification import * +from .eng.WikipediaTheoreticalAppliedClassification import * +from .eng.WikipediaChemFieldsClassification import * +from .eng.WikipediaLuminescenceClassification import * +from .eng.WikipediaIsotopesFissionClassification import * +from .eng.WikipediaSaltsSemiconductorsClassification import * +from .eng.WikipediaBiolumNeurochemClassification import * +from .eng.WikipediaCrystallographyAnalyticalClassification import * +from .eng.WikipediaCompChemSpectroscopyClassification import * +from .eng.WikipediaChemEngSpecialtiesClassification import * from .eng.YahooAnswersTopicsClassification import * from .eng.YelpReviewFullClassification import * from .est.estonian_valence import * diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py index 76ee173f4b..5adaba8c7e 100644 --- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -7,10 +7,10 @@ class SDSEyeProtectionClassification(AbsTaskClassification): metadata = TaskMetadata( name="SDSEyeProtectionClassification", - description="TBW", - reference="https://www.kaggle.com/datasets/eliseu10/material-safety-data-sheets", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/SmallSDSEyeProtectionClassification", + "path": "BASF-AI/SDSEyeProtectionClassification", "revision": "35cbe5ee544dd26e343238a333de4568e6f77819", }, type="Classification", @@ -20,12 +20,18 @@ class SDSEyeProtectionClassification(AbsTaskClassification): eval_langs=["eng-Latn"], main_score="accuracy", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators="derived", + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated and reviewed", dialect=[], sample_creation=None, - bibtex_citation=None, - descriptive_stats={} + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py index ec23e8846c..6f3d339bec 100644 --- a/mteb/tasks/Classification/eng/SDSGlovesClassification.py +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -7,10 +7,10 @@ class SDSGlovesClassification(AbsTaskClassification): metadata = TaskMetadata( name="SDSGlovesClassification", - description="TBW", - reference="https://www.kaggle.com/datasets/eliseu10/material-safety-data-sheets", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/SmallSDSGlovesClassification", + "path": "BASF-AI/SDSGlovesClassification", "revision": "c723236c5ec417d79512e6104aca9d2cd88168f6", }, type="Classification", @@ -20,12 +20,18 @@ class SDSGlovesClassification(AbsTaskClassification): eval_langs=["eng-Latn"], main_score="accuracy", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators="derived", + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated and reviewed", dialect=[], sample_creation=None, - bibtex_citation=None, - descriptive_stats={} + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py new file mode 100644 index 0000000000..30194b4950 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaBioMetChemClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaBioMetChemClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2GeneExpressionVsMetallurgyClassification", + "revision": "6ac491e5de9070c6dd434b31e76d3d379123dcff", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py new file mode 100644 index 0000000000..ff3fd4f6ac --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaBiolumNeurochemClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaBiolumNeurochemClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium2BioluminescenceVsNeurochemistryClassification", + "revision": "2f68b7d34c2be896e46b14533573b366e59e5aae", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py new file mode 100644 index 0000000000..41c8c025d5 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemEngSpecialtiesClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemEngSpecialtiesClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium5Classification", + "revision": "f81a76a2fb690e5d5bd7a26dd07e85cdf8405dfb", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py new file mode 100644 index 0000000000..3a6283556c --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemFieldsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemFieldsClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEZ10Classification", + "revision": "bb465f7e0dc023c7effc39b45aa268ff70d4312c", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py new file mode 100644 index 0000000000..9a7c52bd7d --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemistryTopicsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemistryTopicsClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy10Classification", + "revision": "d8fb355db2248f95df8ea410a43aa1db1ee96ba4", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py new file mode 100644 index 0000000000..837800cc1f --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCompChemSpectroscopyClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium2ComputationalVsSpectroscopistsClassification", + "revision": "474d706a22b0451b5846d623aa4b4234ba5b0513", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py new file mode 100644 index 0000000000..3e44b91e77 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaCryobiologySeparationClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCryobiologySeparationClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy5Classification", + "revision": "858633e882dadd1ec6a0d220f7549bcafd379236", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py new file mode 100644 index 0000000000..e42d787949 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCrystallographyAnalyticalClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", + "revision": "740565a6a853aaed1114a13bdfd5fd46857b4f11", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py b/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py deleted file mode 100644 index 293dac89c8..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEZ10Classification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEZ10Classification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEZ10Classification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEZ10Classification", - "revision": "8121c72f71ccb570bcfdd9f0f6b52059507db983", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py b/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py deleted file mode 100644 index e908175746..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEZ2Classification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEZ2Classification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEZ2Classification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEZ2Classification", - "revision": "1e4c5cbdfca9b7dab2d0822eff88a0bf5f79c429", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py b/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py deleted file mode 100644 index 5f97f1e4e0..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEasy10Classification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEasy10Classification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEasy10Classification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy10Classification", - "revision": "c1a2d556de003739a787e2853ca9746b4ee1333f", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py deleted file mode 100644 index 4708ca342b..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEasy2GeneExpressionVsMetallurgyClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEasy2GeneExpressionVsMetallurgyClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEasy2GeneExpressionVsMetallurgyClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy2GeneExpressionVsMetallurgyClassification", - "revision": "6ac491e5de9070c6dd434b31e76d3d379123dcff", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py deleted file mode 100644 index 088c5164fe..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEasy2GreenhouseVsEnantiopureClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEasy2GreenhouseVsEnantiopureClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEasy2GreenhouseVsEnantiopureClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy2GreenhouseVsEnantiopureClassification", - "revision": "0cfc1a83b6ed832454e8f4f93f7a0e26208274d9", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py deleted file mode 100644 index bc48090be6..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEasy2SolidStateVsColloidalClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEasy2SolidStateVsColloidalClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEasy2SolidStateVsColloidalClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy2SolidStateVsColloidalClassification", - "revision": "7d8df44e588b6143d4856c781f72f919fa0599a7", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py b/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py deleted file mode 100644 index edf16cc472..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEasy2SpecialClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEasy2SpecialClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEasy2SpecialClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy2SpecialClassification", - "revision": "96d1d9b37c4693f74c46c83d63a290573f78d511", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py b/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py deleted file mode 100644 index c5755b1b11..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaEasy5Classification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaEasy5Classification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaEasy5Classification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy5Classification", - "revision": "858633e882dadd1ec6a0d220f7549bcafd379236", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py new file mode 100644 index 0000000000..a24567473f --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaGreenhouseEnantiopureClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaGreenhouseEnantiopureClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2GreenhouseVsEnantiopureClassification", + "revision": "0cfc1a83b6ed832454e8f4f93f7a0e26208274d9", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py deleted file mode 100644 index c70648bf12..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaHard2BioluminescenceVsLuminescenceClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaHard2BioluminescenceVsLuminescenceClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaHard2BioluminescenceVsLuminescenceClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaHard2BioluminescenceVsLuminescenceClassification", - "revision": "21c4dcebe2c5b36a35292e6441e7a10b59bf4896", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py deleted file mode 100644 index d7de5677ef..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", - "revision": "897743346c7c794264f7dbfadc3978aa2895e8e2", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py b/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py deleted file mode 100644 index 0e5d31614a..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaHard2SaltsVsSemiconductorMaterialsClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaHard2SaltsVsSemiconductorMaterialsClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaHard2SaltsVsSemiconductorMaterialsClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaHard2SaltsVsSemiconductorMaterialsClassification", - "revision": "9e5415a096012fa2d1f3a929952cf9859e4550e7", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py new file mode 100644 index 0000000000..d1d713cfb8 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaIsotopesFissionClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaIsotopesFissionClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", + "revision": "897743346c7c794264f7dbfadc3978aa2895e8e2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py new file mode 100644 index 0000000000..e96f172e81 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaLuminescenceClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaLuminescenceClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaHard2BioluminescenceVsLuminescenceClassification", + "revision": "21c4dcebe2c5b36a35292e6441e7a10b59bf4896", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py deleted file mode 100644 index 2e8aa28c4d..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaMedium2BioluminescenceVsNeurochemistryClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaMedium2BioluminescenceVsNeurochemistryClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaMedium2BioluminescenceVsNeurochemistryClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaMedium2BioluminescenceVsNeurochemistryClassification", - "revision": "2f68b7d34c2be896e46b14533573b366e59e5aae", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py deleted file mode 100644 index 8c9932a467..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaMedium2ComputationalVsSpectroscopistsClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaMedium2ComputationalVsSpectroscopistsClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaMedium2ComputationalVsSpectroscopistsClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaMedium2ComputationalVsSpectroscopistsClassification", - "revision": "474d706a22b0451b5846d623aa4b4234ba5b0513", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py b/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py deleted file mode 100644 index c89ab61e6f..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", - "revision": "740565a6a853aaed1114a13bdfd5fd46857b4f11", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py b/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py deleted file mode 100644 index b9d00e96c2..0000000000 --- a/mteb/tasks/Classification/eng/WikipediaMedium5Classification.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.AbsTaskClassification import AbsTaskClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaMedium5Classification(AbsTaskClassification): - metadata = TaskMetadata( - name="WikipediaMedium5Classification", - description="TBW", - reference="https://wikipedia.org", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaMedium5Classification", - "revision": "f81a76a2fb690e5d5bd7a26dd07e85cdf8405dfb", - }, - type="Classification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={} - ) diff --git a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py new file mode 100644 index 0000000000..add6687bd8 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaOrganicInorganicClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaOrganicInorganicClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2SpecialClassification", + "revision": "96d1d9b37c4693f74c46c83d63a290573f78d511", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py new file mode 100644 index 0000000000..b71c9dcd2c --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaSaltsSemiconductorsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaSaltsSemiconductorsClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-We-Create-Chemistry/WikipediaHard2SaltsVsSemiconductorMaterialsClassification", + "revision": "9e5415a096012fa2d1f3a929952cf9859e4550e7", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py new file mode 100644 index 0000000000..6ce489b681 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaSolidStateColloidalClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaSolidStateColloidalClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2SolidStateVsColloidalClassification", + "revision": "7d8df44e588b6143d4856c781f72f919fa0599a7", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py new file mode 100644 index 0000000000..8724042a7d --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaTheoreticalAppliedClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEZ2Classification", + "revision": "39350f72444caf0cff039dbf0f57933d8226f73e", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index fc92267c3e..601029c4d6 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -18,8 +18,8 @@ from .eng.StackExchangeClusteringP2P import * from .eng.TwentyNewsgroupsClustering import * from .eng.WikiCitiesClustering import * -from .eng.WikipediaEasy10Clustering import * -from .eng.WikipediaMedium5Clustering import * +from .eng.WikipediaChemistryTopicsClustering import * +from .eng.WikipediaChemistrySpecialtiesClustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py new file mode 100644 index 0000000000..4c6007902d --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaSpecialtiesInChemistryClustering", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium5Clustering", + "revision": "7754d8d296f9f4c3af1c6426fab36304730ccddf", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=None, + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators=None, + dialect=[], + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py new file mode 100644 index 0000000000..bd66ccfcd1 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class WikipediaChemistryTopicsClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaChemistryTopicsClustering", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-We-Create-Chemistry/WikipediaEasy10Clustering", + "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=None, + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators=None, + dialect=[], + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py b/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py deleted file mode 100644 index 3425575550..0000000000 --- a/mteb/tasks/Clustering/eng/WikipediaEasy10Clustering.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.TaskMetadata import TaskMetadata - -from ....abstasks.AbsTaskClustering import AbsTaskClustering - - -class WikipediaEasy10Clustering(AbsTaskClustering): - metadata = TaskMetadata( - name="WikipediaEasy10Clustering", - description="TBW", - reference="https://huggingface.co/datasets/wikipedia", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy10Clustering", - "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", - }, - type="Clustering", - category="p2p", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="v_measure", - date=None, - domains=None, - task_subtypes=[], - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": None, "avg_character_length": None}, - ) diff --git a/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py b/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py deleted file mode 100644 index f1f810e1eb..0000000000 --- a/mteb/tasks/Clustering/eng/WikipediaMedium5Clustering.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.TaskMetadata import TaskMetadata - -from ....abstasks.AbsTaskClustering import AbsTaskClustering - - -class WikipediaMedium5Clustering(AbsTaskClustering): - metadata = TaskMetadata( - name="WikipediaMedium5Clustering", - description="TBW", - reference="https://huggingface.co/datasets/wikipedia", - dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaMedium5Clustering", - "revision": "7754d8d296f9f4c3af1c6426fab36304730ccddf", - }, - type="Clustering", - category="p2p", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="v_measure", - date=None, - domains=None, - task_subtypes=[], - license=None, - annotations_creators="derived", - dialect=[], - sample_creation=None, - bibtex_citation=None, - descriptive_stats={"n_samples": None, "avg_character_length": None}, - ) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 4cef32c163..1c8072708d 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -10,6 +10,7 @@ from .eng.PubChemSMILESCanonTitlePC import * from .eng.PubChemSMILESIsoDescPC import * from .eng.PubChemSMILESIsoTitlePC import * +from .eng.PubChemSMILESPC import * from .eng.PubChemSynonymPC import * from .eng.PubChemWikiParagraphsPC import * from .eng.SprintDuplicateQuestionsPC import * @@ -22,6 +23,7 @@ from .kor.KlueNLI import * from .multilingual.OpusparcusPC import * from .multilingual.PawsXPairClassification import * +from .multilingual.PubChemWikiPairClassification import * from .multilingual.RTE3 import * from .multilingual.XNLI import * from .multilingual.XStance import * diff --git a/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py index b6f190db37..d89bec5ef1 100644 --- a/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py +++ b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py @@ -1,28 +1,18 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_DATASET_COLUMN_MAP = { - "sentence1": "formula", - "sentence2": "smiles", - "labels": "label", -} - - class CoconutSMILES2FormulaPC(AbsTaskPairClassification): metadata = TaskMetadata( name="CoconutSMILES2FormulaPC", - description="""TBW""", - reference="https://coconut.naturalproducts.net/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/CoconutSMILES2FormulaPC", + "path": "BASF-AI/CoconutSMILES2FormulaPC", "revision": "e46d4868e417703bdcf32aadbe5d0e05a1b7f085" }, type="PairClassification", @@ -30,19 +20,25 @@ class CoconutSMILES2FormulaPC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, + license="cc-by-nc-sa-4.0", annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return @@ -57,16 +53,16 @@ def load_data(self, **kwargs: Any) -> None: def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"], label=_DATASET_COLUMN_MAP["labels"] + self.dataset, seed=self.seed, splits=["test"], label='label' ) _dataset = {} for split in self.metadata.eval_splits: hf_dataset = self.dataset[split] _dataset[split] = [ { - "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], - "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], - "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]] + "sentence1": hf_dataset['formula'], + "sentence2": hf_dataset['smiles'], + "labels": hf_dataset['label'] } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py index 167389f3d8..32c3447301 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -1,8 +1,5 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification @@ -12,10 +9,10 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemAISentenceParaphrasePC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemAISentenceParaphrasePC", + "path": "BASF-AI/PubChemAISentenceParaphrasePC", "revision": "f33a205966ce032f957c3a22f4f9e378f89a2c56" }, type="PairClassification", @@ -25,17 +22,23 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): eval_langs=["eng-Latn"], main_score="max_f1", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators="derived", + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py index 9e6eff57d4..ac00022725 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py @@ -1,28 +1,18 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_DATASET_COLUMN_MAP = { - "sentence1": "description", - "sentence2": "canonical_smiles", - "labels": "labels", -} - - class PubChemSMILESCanonDescPC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemSMILESCanonDescPC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonDescPC", + "path": "BASF-AI/PubChemSMILESCanonDescPC", "revision": "6236cc0c3003bea6034d00f96d2202f7c05629c6" }, type="PairClassification", @@ -30,19 +20,25 @@ class PubChemSMILESCanonDescPC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return @@ -67,9 +63,9 @@ def dataset_transform(self): hf_dataset = self.dataset[split] _dataset[split] = [ { - "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], - "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], - "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]] + "sentence1": hf_dataset["description"], + "sentence2": hf_dataset["canonical_smiles"], + "labels": hf_dataset["labels"] } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py index 173f140976..244118ef4e 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py @@ -1,28 +1,18 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_DATASET_COLUMN_MAP = { - "sentence1": "title", - "sentence2": "canonical_smiles", - "labels": "labels", -} - - class PubChemSMILESCanonTitlePC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemSMILESCanonTitlePC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESCanonTitlePC", + "path": "BASF-AI/PubChemSMILESCanonTitlePC", "revision": "3cce5bbb9ffe0d63a74102f2f5037aea47244c8f" }, type="PairClassification", @@ -30,22 +20,25 @@ class PubChemSMILESCanonTitlePC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={ - "n_samples": {"train": 43052}, - "avg_character_length": {"train": 34} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return @@ -70,9 +63,9 @@ def dataset_transform(self): hf_dataset = self.dataset[split] _dataset[split] = [ { - "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], - "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], - "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]] + "sentence1": hf_dataset["title"], + "sentence2": hf_dataset["canonical_smiles"], + "labels": hf_dataset["labels"] } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py index 661c825b60..ae7da53070 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py @@ -1,28 +1,18 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_DATASET_COLUMN_MAP = { - "sentence1": "description", - "sentence2": "isomeric_smiles", - "labels": "labels", -} - - class PubChemSMILESIsoDescPC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemSMILESIsoDescPC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoDescPC", + "path": "BASF-AI/PubChemSMILESIsoDescPC", "revision": "afedff80aa393a8bdc5e05da46252dc5fde99029" }, type="PairClassification", @@ -30,19 +20,25 @@ class PubChemSMILESIsoDescPC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return @@ -67,9 +63,9 @@ def dataset_transform(self): hf_dataset = self.dataset[split] _dataset[split] = [ { - "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], - "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], - "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]], + "sentence1": hf_dataset["description"], + "sentence2": hf_dataset["isomeric_smiles"], + "labels": hf_dataset["labels"], } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py index ccc0b699e1..ba3da3b0c1 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py @@ -1,28 +1,18 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_DATASET_COLUMN_MAP = { - "sentence1": "title", - "sentence2": "isomeric_smiles", - "labels": "labels", -} - - class PubChemSMILESIsoTitlePC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemSMILESIsoTitlePC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSMILESIsoTitlePC", + "path": "BASF-AI/PubChemSMILESIsoTitlePC", "revision": "1b0d57516ec7c168b8da44f80148b5418ba394b3" }, type="PairClassification", @@ -30,22 +20,25 @@ class PubChemSMILESIsoTitlePC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={ - "n_samples": {"train": 43052}, - "avg_character_length": {"train": 34} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return @@ -70,9 +63,9 @@ def dataset_transform(self): hf_dataset = self.dataset[split] _dataset[split] = [ { - "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], - "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], - "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]], + "sentence1": hf_dataset["title"], + "sentence2": hf_dataset["isomeric_smiles"], + "labels": hf_dataset["labels"], } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py new file mode 100644 index 0000000000..6c63b1da7d --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + +_DATASET_COLUMN_MAP = [ + { + "name": "iso-desc", + "sent1": "description", + "sent2": "isomeric_smiles", + "labels": "labels", + }, + { + "name": "iso-title", + "sent1": "title", + "sent2": "isomeric_smiles", + "labels": "labels", + }, + { + "name": "canon-desc", + "sent1": "description", + "sent2": "canonical_smiles", + "labels": "labels", + }, + { + "name": "canon-title", + "sent1": "title", + "sent2": "canonical_smiles", + "labels": "labels", + }, +] + +class PubChemSMILESPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSMILESPC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemSMILESPairClassification", + "revision": "7ba40b69f5fe6ffe4cc189aac9e1710913c73c8a" + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) + + def load_data(self): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _hf_dataset = None + for dataset_col_map in _DATASET_COLUMN_MAP: + _dataset = datasets.load_dataset( + self.metadata_dict["dataset"]["path"], + dataset_col_map["name"], + revision=self.metadata_dict["dataset"]["revision"], + ) + + _dataset = _dataset.rename_columns( + { + dataset_col_map["sent1"]: "sentence1", + dataset_col_map["sent2"]: "sentence2", + dataset_col_map["labels"]: "labels", + } + ) + + if _hf_dataset is None: + _hf_dataset = _dataset + else: + _hf_dataset["test"] = datasets.concatenate_datasets( + [_hf_dataset["test"], _dataset["test"]] + ) + + self.dataset = _hf_dataset + self.dataset_transform() + self.data_loaded = True + + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sentence1"], + "sentence2": hf_dataset["sentence2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py index 9bec11eede..09c6b8dae7 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -1,28 +1,18 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_DATASET_COLUMN_MAP = { - "sentence1": "title", - "sentence2": "synonyms", - "labels": "labels", -} - - class PubChemSynonymPC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemSynonymPC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemSynonymPC", + "path": "BASF-AI/PubChemSynonymPC", "revision": "5037d69d177c9628fb79cb57eea1299178b28c1b" }, type="PairClassification", @@ -30,19 +20,25 @@ class PubChemSynonymPC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return @@ -63,9 +59,9 @@ def dataset_transform(self): hf_dataset = self.dataset[split] _dataset[split] = [ { - "sentence1": hf_dataset[_DATASET_COLUMN_MAP["sentence1"]], - "sentence2": hf_dataset[_DATASET_COLUMN_MAP["sentence2"]], - "labels": hf_dataset[_DATASET_COLUMN_MAP["labels"]], + "sentence1": hf_dataset["title"], + "sentence2": hf_dataset["synonyms"], + "labels": hf_dataset["labels"], } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py index 9b6f3ee684..f3f2a07e8c 100644 --- a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -1,8 +1,5 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification @@ -12,30 +9,36 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemWikiParagraphsPC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/PubChemWikiParagraphsPC", - "revision": "36c2aabe8e5bdb034701d0a226cac3c27d09575c" + "path": "BASF-AI/PubChemWikiParagraphsPC", + "revision": "7fb14716e4106b72f51a16e682e5cd2d67e9bd70" }, type="PairClassification", category="p2p", modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, + license="cc-by-nc-sa-4.0", annotations_creators="derived", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return diff --git a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py index 56532b86a5..7875fd0fe0 100644 --- a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py @@ -1,8 +1,5 @@ from __future__ import annotations - -from typing import Any - import datasets from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification @@ -12,8 +9,8 @@ class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): metadata = TaskMetadata( name="WikipediaAIParagraphsParaphrasePC", - description="""TBW""", - reference="https://pubchem.ncbi.nlm.nih.gov/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-We-Create-Chemistry/WikipediaAIParagraphsParaphrasePC", "revision": "a430437ea6c6fe0e6461e6d6659f647d0bf62496" @@ -23,19 +20,25 @@ class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators="derived", + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated", dialect=None, - sample_creation="created", - bibtex_citation=None, - descriptive_stats={} + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) - def load_data(self, **kwargs: Any) -> None: + def load_data(self): """Load dataset from HuggingFace hub""" if self.data_loaded: return diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py new file mode 100644 index 0000000000..c3bb5dd5c7 --- /dev/null +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGUAGES = { + 'de': ["deu-Latn", "eng-Latn"], + 'nl': ["nld-Latn", "eng-Latn"], + 'zh': ["zho-Hans", "eng-Latn"], + 'fr': ["fra-Latn", "eng-Latn"], + 'es': ["spa-Latn", "eng-Latn"], + 'pt': ["por-Latn", "eng-Latn"], + 'ms': ["msa-Latn", "eng-Latn"], + 'ko': ["kor-Hang", "eng-Latn"], + 'tr': ["tur-Latn", "eng-Latn"], + 'hi': ["hin-Deva", "eng-Latn"], + 'cs': ["ces-Latn", "eng-Latn"], + 'ja': ["jpn-Jpan", "eng-Latn"], +} + + +class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask): + metadata = TaskMetadata( + name="PubChemWikiPairClassification", + dataset={ + "path": "BASF-AI/PubChemWikiMultilingualPC", + "revision": "3412b208896a37e4ebb5ff7b96f6cc313ee9d2e3", + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + category="s2s", + modalities=["text"], + type="PairClassification", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="max_ap", + date=None, + domains=["Chemistry"], + task_subtypes=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=None, + sample_creation=None, + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) + + def dataset_transform(self) -> None: + _dataset = {} + for lang in self.hf_subsets: + _dataset[lang] = {} + hf_dataset = self.dataset[lang][self.metadata.eval_splits[0]] + _dataset[lang]["test"] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py index 1df15eb5c4..5585373ac3 100644 --- a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py @@ -1,22 +1,18 @@ from __future__ import annotations from mteb.abstasks.TaskMetadata import TaskMetadata - -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval class ChemHotpotQARetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="ChemHotpotQARetrieval", dataset={ - "path": "BASF-We-Create-Chemistry/ChemHotpotQARetrieval", - "revision": "f39c1d16edd269f233be381216bc8146f0857124", + "path": "BASF-AI/ChemHotpotQARetrieval", + "revision": "1840e8a5ac6ec752bbdd97d543ead0189bc7c25b", }, - description=( - "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong" - " supervision for supporting facts to enable more explainable question answering systems." - ), - reference="https://hotpotqa.github.io/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="Retrieval", category="s2p", modalities=["text"], @@ -24,34 +20,18 @@ class ChemHotpotQARetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa, - title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", - author = "Yang, Zhilin and - Qi, Peng and - Zhang, Saizheng and - Bengio, Yoshua and - Cohen, William and - Salakhutdinov, Ruslan and - Manning, Christopher D.", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1259", - doi = "10.18653/v1/D18-1259", - pages = "2369--2380", - abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", -}""", - descriptive_stats={} + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) diff --git a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py index 70ae066e72..2342bcaa79 100644 --- a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py @@ -1,24 +1,18 @@ from __future__ import annotations -import os -import logging from mteb.abstasks.TaskMetadata import TaskMetadata - - -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval - -logger = logging.getLogger(__name__) +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval class ChemNQRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="ChemNQRetrieval", dataset={ - "path": "BASF-We-Create-Chemistry/ChemNQRetrieval", - "revision": "023e7a813e3b73d8d33551ed2aea511314d612e2", + "path": "BASF-AI/ChemNQRetrieval", + "revision": "5d958fb6b31055495347724d46431ba41309b03a", }, - description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", - reference="https://ai.google.com/research/NaturalQuestions/", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", type="Retrieval", category="s2p", modalities=["text"], @@ -26,17 +20,18 @@ class ChemNQRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Chemistry"], task_subtypes=None, - license=None, - annotations_creators=None, + license="cc-by-nc-sa-4.0", + annotations_creators="derived", dialect=None, sample_creation=None, - bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, - author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh - and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee - and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le - and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational - Linguistics}}""", - descriptive_stats={} + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, ) From 4864dc046a7f5ea7bab1f030b2b2a45ea7c47b7f Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sat, 4 Jan 2025 23:05:49 +0000 Subject: [PATCH 34/49] Remove unnecessary files and tasks for MTEB --- chem_eval.py | 172 ------------------ mteb/tasks/BitextMining/__init__.py | 5 - .../multilingual/CoconutSMILES2FormulaBM.py | 63 ------- .../multilingual/PubChemSMILESCanonDescBM.py | 63 ------- .../multilingual/PubChemSMILESCanonTitleBM.py | 63 ------- .../multilingual/PubChemSMILESISoDescBM.py | 63 ------- .../multilingual/PubChemSMILESISoTitleBM.py | 63 ------- mteb/tasks/PairClassification/__init__.py | 5 - .../eng/CoconutSMILES2FormulaPC.py | 68 ------- .../eng/PubChemSMILESCanonDescPC.py | 71 -------- .../eng/PubChemSMILESCanonTitlePC.py | 71 -------- .../eng/PubChemSMILESIsoDescPC.py | 71 -------- .../eng/PubChemSMILESIsoTitlePC.py | 71 -------- 13 files changed, 849 deletions(-) delete mode 100644 chem_eval.py delete mode 100644 mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py delete mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py delete mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py delete mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py delete mode 100644 mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py delete mode 100644 mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py delete mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py delete mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py delete mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py delete mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py diff --git a/chem_eval.py b/chem_eval.py deleted file mode 100644 index cc609e8e3b..0000000000 --- a/chem_eval.py +++ /dev/null @@ -1,172 +0,0 @@ -import mteb -import os -from tqdm import tqdm -import wandb -import json -import time - - -def is_run_available(model_name, model_revision): - api = wandb.Api() - runs = api.runs('Chembedding - Benchmarking') - for run in runs: - if run.name == model_name and run.config['revision'] == model_revision and run.state == "finished": - return True - return False - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def json_parser(data): - task_name = data["task_name"] - output = {} - if task_name.endswith("PC"): - output["PairClassification (Max F1)"] = data["scores"]["test"][0]["main_score"] - elif task_name.endswith("Classification"): - output["Classification (Accuracy)"] = data["scores"]["test"][0]["main_score"] - elif "BitextMining" in task_name or task_name.endswith("BM"): - output["Bitext Mining (F1)"] = data["scores"]["test"][0]["main_score"] - elif task_name.endswith("Retrieval"): - output["Retrieval (NDCG@10)"] = data["scores"]["test"][0]["main_score"] - elif task_name.endswith("Clustering"): - output["Clustering (V Measure)"] = data["scores"]["test"][0]["main_score"] - return output - - -if __name__ == "__main__": - now = time.time() - - models = {"google-bert/bert-base-uncased": "86b5e0934494bd15c9632b12f734a8a67f723594", - "allenai/scibert_scivocab_uncased": "24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1", - "nomic-ai/nomic-bert-2048": "no_revision_available", - "intfloat/e5-small": "e272f3049e853b47cb5ca3952268c6662abda68f", - "intfloat/e5-base": "b533fe4636f4a2507c08ddab40644d20b0006d6a", - "intfloat/e5-large": "4dc6d853a804b9c8886ede6dda8a073b7dc08a81", - "intfloat/e5-small-v2": "dca8b1a9dae0d4575df2bf423a5edb485a431236", - "intfloat/e5-base-v2": "1c644c92ad3ba1efdad3f1451a637716616a20e8", - "intfloat/e5-large-v2": "b322e09026e4ea05f42beadf4d661fb4e101d311", - "intfloat/multilingual-e5-small": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", - "intfloat/multilingual-e5-base": "d13f1b27baf31030b7fd040960d60d909913633f", - "intfloat/multilingual-e5-large": "4dc6d853a804b9c8886ede6dda8a073b7dc08a81", - "nomic-ai/nomic-embed-text-v1": "0759316f275aa0cb93a5b830973843ca66babcf5", - "nomic-ai/nomic-embed-text-v1.5": "b0753ae76394dd36bcfb912a46018088bca48be0", - "recobo/chemical-bert-uncased": "498698d28fcf7ce5954852a0444c864bdf232b64", - "BAAI/bge-m3": "5617a9f61b028005a4858fdac845db406aefb181", - "BAAI/bge-small-en": "2275a7bdee235e9b4f01fa73aa60d3311983cfea", - "BAAI/bge-base-en": "b737bf5dcc6ee8bdc530531266b4804a5d77b5d8", - "BAAI/bge-large-en": "abe7d9d814b775ca171121fb03f394dc42974275", - "BAAI/bge-small-en-v1.5": "5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", - "BAAI/bge-base-en-v1.5": "a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", - "BAAI/bge-large-en-v1.5": "d4aa6901d3a41ba39fb536a557fa166f842b0e09", - "all-mpnet-base-v2": "84f2bcc00d77236f9e89c8a360a00fb1139bf47d", - "multi-qa-mpnet-base-dot-v1": "3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f", - "all-MiniLM-L12-v2": "a05860a77cef7b37e0048a7864658139bc18a854", - "all-MiniLM-L6-v2": "8b3219a92973c328a8e22fadcfa821b5dc75636a", - "m3rg-iitd/matscibert": "ced9d8f5f208712c4a90f98a246fe32155b29995", - "text-embedding-ada-002": "1", - "text-embedding-3-small": "1", - "text-embedding-3-large": "1", - "amazon-titan-embed-text-v1": "1", - "amazon-titan-embed-text-v2": "1", - "cohere-embed-english-v3": "1", - "cohere-embed-multilingual-v3": "1" - } - - all_tasks = [ - # Pair Classification - "CoconutSMILES2FormulaPC", - "WikipediaAIParagraphsParaphrasePC", - "PubChemAISentenceParaphrasePC", - "PubChemSMILESCanonDescPC", - "PubChemSMILESCanonTitlePC", - "PubChemSMILESIsoDescPC", - "PubChemSMILESIsoTitlePC", - "PubChemSynonymPC", - "PubChemWikiParagraphsPC", - # Classification - "WikipediaEasy2GeneExpressionVsMetallurgyClassification", - "WikipediaEasy2GreenhouseVsEnantiopureClassification", - "WikipediaEasy2SolidStateVsColloidalClassification", - "WikipediaEasy2SpecialClassification", - "WikipediaEasy5Classification", - "WikipediaEasy10Classification", - "WikipediaEZ2Classification", - "WikipediaEZ10Classification", - "WikipediaHard2BioluminescenceVsLuminescenceClassification", - "WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", - "WikipediaHard2SaltsVsSemiconductorMaterialsClassification", - "WikipediaMedium2BioluminescenceVsNeurochemistryClassification", - "WikipediaMedium2ComputationalVsSpectroscopistsClassification", - "WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", - "WikipediaMedium5Classification", - "SDSEyeProtectionClassification", - "SDSGlovesClassification", - # Retrieval - "ChemNQRetrieval", - "ChemHotpotQARetrieval", - "CoconutRetrieval" - # Bitext Mining - "CoconutSMILES2Formula1BM", - "CoconutSMILES2Formula2BM", - "PubChemSMILESISoTitleBM", - "PubChemSMILESCanonTitleBM", - "PubChemSMILESISoDescBM", - "PubChemSMILESCanonDescBM", - # Clustering - "WikipediaEasy10Clustering", - "WikipediaMedium5Clustering" - ] - - tasks = mteb.get_tasks(tasks=all_tasks) - - for model_full_name, model_rev in tqdm(models.items()): - if "/" in model_full_name: - model_name = model_full_name.split("/")[1] - else: - model_name = model_full_name - - if is_run_available(model_name, model_rev): - print(f"Skipping {model_name} - {model_rev}") - continue - - try: - wandb.init(project='Chembedding - Benchmarking', name=model_name, - config={"revision": model_rev}) - model = mteb.get_model(model_full_name) - evaluation = mteb.MTEB(tasks=tasks) - evaluation.run(model, output_folder="chem_results", - overwrite_results=False) - except Exception as e: - print(f"Error Evaluating Model {model_name}: {e}") - wandb.finish() - continue - - for task_name in tqdm(all_tasks): - data = read_json(os.path.join( - "chem_results", - model_full_name.replace("/", "__"), - model_rev, - task_name + '.json', - )) - output = json_parser(data) - wandb.log(output) - - for metric, score in output.items(): - table = wandb.Table(data=[[metric, score]], - columns=["Metric", "Score"]) - bar_plot = wandb.plot.bar( - table, "Metric", "Score", title=f"{task_name} Performance") - wandb.log({f"{task_name}_bar_plot": bar_plot}) - - wandb.finish() - - elapsed = time.time() - now - - hours = int(elapsed // 3600) - minutes = int((elapsed % 3600) // 60) - seconds = int(elapsed % 60) - - print(f"Elapsed time: {hours} hours, {minutes} minutes, {seconds} seconds") \ No newline at end of file diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 0d991d1358..a8addef6a6 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -5,7 +5,6 @@ from .multilingual.BibleNLPBitextMining import * from .multilingual.BUCCBitextMining import * from .multilingual.BUCCBitextMiningFast import * -from .multilingual.CoconutSMILES2FormulaBM import * from .multilingual.DiaBLaBitextMining import * from .multilingual.FloresBitextMining import * from .multilingual.IN22ConvBitextMining import * @@ -20,10 +19,6 @@ from .multilingual.NusaXBitextMining import * from .multilingual.PhincBitextMining import * from .multilingual.PubChemSMILESBitextMining import * -from .multilingual.PubChemSMILESCanonDescBM import * -from .multilingual.PubChemSMILESCanonTitleBM import * -from .multilingual.PubChemSMILESISoDescBM import * -from .multilingual.PubChemSMILESISoTitleBM import * from .multilingual.RomaTalesBitextMining import * from .multilingual.TatoebaBitextMining import * from .srn.SRNCorpusBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py b/mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py deleted file mode 100644 index fb7fc15ee7..0000000000 --- a/mteb/tasks/BitextMining/multilingual/CoconutSMILES2FormulaBM.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from mteb.abstasks.MultilingualTask import MultilingualTask -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class CoconutSMILES2FormulaBM(AbsTaskBitextMining, MultilingualTask): - metadata = TaskMetadata( - name="CoconutSMILES2FormulaBM", - dataset={ - "path": "BASF-AI/CoconutSMILES2FormulaBM", - "revision": "af0913db3a92d4b16ad679733c281b3237d399a5" - }, - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - type="BitextMining", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs={ - "en-en": ["en-Latn", "eng-Latn"] - }, - main_score="f1", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - **self.metadata_dict["dataset"]) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].rename_columns({ - "formula": "sentence1", - "smiles": "sentence2" - }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py deleted file mode 100644 index 127d9295dd..0000000000 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonDescBM.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from mteb.abstasks.MultilingualTask import MultilingualTask -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESCanonDescBM(AbsTaskBitextMining, MultilingualTask): - metadata = TaskMetadata( - name="PubChemSMILESCanonDescBM", - dataset={ - "path": "BASF-AI/PubChemSMILESCanonDescBM", - "revision": "a721de4af2857bf3cc014b92f013a4f573d9cb00" - }, - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - type="BitextMining", - category="s2p", - modalities=["text"], - eval_splits=["test"], - eval_langs={ - "en-en": ["en-Latn", "eng-Latn"] - }, - main_score="f1", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - **self.metadata_dict["dataset"]) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].rename_columns({ - "description": "sentence1", - "canonical_smiles": "sentence2" - }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py deleted file mode 100644 index 29142fc117..0000000000 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESCanonTitleBM.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from mteb.abstasks.MultilingualTask import MultilingualTask -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESCanonTitleBM(AbsTaskBitextMining, MultilingualTask): - metadata = TaskMetadata( - name="PubChemSMILESCanonTitleBM", - dataset={ - "path": "BASF-AI/PubChemSMILESCanonTitleBM", - "revision": "2c7a74635cf41b2ca50d878fa1ff670fc5af3ea1" - }, - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - type="BitextMining", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs={ - "en-en": ["en-Latn", "eng-Latn"] - }, - main_score="f1", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - **self.metadata_dict["dataset"]) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].rename_columns({ - "title": "sentence1", - "canonical_smiles": "sentence2" - }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py deleted file mode 100644 index b135ea68d4..0000000000 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoDescBM.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from mteb.abstasks.MultilingualTask import MultilingualTask -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESISoDescBM(AbsTaskBitextMining, MultilingualTask): - metadata = TaskMetadata( - name="PubChemSMILESISoDescBM", - dataset={ - "path": "BASF-AI/PubChemSMILESIsoDescBM", - "revision": "33a2064662e851ea5e42653b303eb9e0f6878a07" - }, - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - type="BitextMining", - category="s2p", - modalities=["text"], - eval_splits=["test"], - eval_langs={ - "en-en": ["en-Latn", "eng-Latn"] - }, - main_score="f1", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - **self.metadata_dict["dataset"]) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].rename_columns({ - "description": "sentence1", - "isomeric_smiles": "sentence2" - }) diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py b/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py deleted file mode 100644 index d0a5361aeb..0000000000 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESISoTitleBM.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from mteb.abstasks.MultilingualTask import MultilingualTask -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESISoTitleBM(AbsTaskBitextMining, MultilingualTask): - metadata = TaskMetadata( - name="PubChemSMILESISoTitleBM", - dataset={ - "path": "BASF-AI/PubChemSMILESIsoTitleBM", - "revision": "d60f975694c1841e60a39518b80e157c145f0be1" - }, - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - type="BitextMining", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs={ - "en-en": ["en-Latn", "eng-Latn"] - }, - main_score="f1", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - - self.dataset = {} - - for lang in self.hf_subsets: - self.dataset[lang] = datasets.load_dataset( - **self.metadata_dict["dataset"]) - - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - for lang in self.hf_subsets: - self.dataset[lang] = self.dataset[lang].rename_columns({ - "title": "sentence1", - "isomeric_smiles": "sentence2" - }) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 1c8072708d..d821346ab5 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -3,13 +3,8 @@ from .ara.ArEntail import * from .ces.CTKFactsNLI import * from .deu.FalseFriendsDeEnPC import * -from .eng.CoconutSMILES2FormulaPC import * from .eng.LegalBenchPC import * from .eng.PubChemAISentenceParaphrasePC import * -from .eng.PubChemSMILESCanonDescPC import * -from .eng.PubChemSMILESCanonTitlePC import * -from .eng.PubChemSMILESIsoDescPC import * -from .eng.PubChemSMILESIsoTitlePC import * from .eng.PubChemSMILESPC import * from .eng.PubChemSynonymPC import * from .eng.PubChemWikiParagraphsPC import * diff --git a/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py b/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py deleted file mode 100644 index d89bec5ef1..0000000000 --- a/mteb/tasks/PairClassification/eng/CoconutSMILES2FormulaPC.py +++ /dev/null @@ -1,68 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class CoconutSMILES2FormulaPC(AbsTaskPairClassification): - metadata = TaskMetadata( - name="CoconutSMILES2FormulaPC", - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - dataset={ - "path": "BASF-AI/CoconutSMILES2FormulaPC", - "revision": "e46d4868e417703bdcf32aadbe5d0e05a1b7f085" - }, - type="PairClassification", - category="p2p", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="max_ap", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"], label='label' - ) - _dataset = {} - for split in self.metadata.eval_splits: - hf_dataset = self.dataset[split] - _dataset[split] = [ - { - "sentence1": hf_dataset['formula'], - "sentence2": hf_dataset['smiles'], - "labels": hf_dataset['label'] - } - ] - self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py deleted file mode 100644 index ac00022725..0000000000 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonDescPC.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESCanonDescPC(AbsTaskPairClassification): - metadata = TaskMetadata( - name="PubChemSMILESCanonDescPC", - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - dataset={ - "path": "BASF-AI/PubChemSMILESCanonDescPC", - "revision": "6236cc0c3003bea6034d00f96d2202f7c05629c6" - }, - type="PairClassification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="max_ap", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - _dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - - self.dataset = _dataset - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" - ) - - _dataset = {} - for split in self.metadata.eval_splits: - hf_dataset = self.dataset[split] - _dataset[split] = [ - { - "sentence1": hf_dataset["description"], - "sentence2": hf_dataset["canonical_smiles"], - "labels": hf_dataset["labels"] - } - ] - self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py deleted file mode 100644 index 244118ef4e..0000000000 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESCanonTitlePC.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESCanonTitlePC(AbsTaskPairClassification): - metadata = TaskMetadata( - name="PubChemSMILESCanonTitlePC", - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - dataset={ - "path": "BASF-AI/PubChemSMILESCanonTitlePC", - "revision": "3cce5bbb9ffe0d63a74102f2f5037aea47244c8f" - }, - type="PairClassification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="max_ap", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - _dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - - self.dataset = _dataset - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" - ) - - _dataset = {} - for split in self.metadata.eval_splits: - hf_dataset = self.dataset[split] - _dataset[split] = [ - { - "sentence1": hf_dataset["title"], - "sentence2": hf_dataset["canonical_smiles"], - "labels": hf_dataset["labels"] - } - ] - self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py deleted file mode 100644 index ae7da53070..0000000000 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoDescPC.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESIsoDescPC(AbsTaskPairClassification): - metadata = TaskMetadata( - name="PubChemSMILESIsoDescPC", - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - dataset={ - "path": "BASF-AI/PubChemSMILESIsoDescPC", - "revision": "afedff80aa393a8bdc5e05da46252dc5fde99029" - }, - type="PairClassification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="max_ap", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - _dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - - self.dataset = _dataset - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" - ) - - _dataset = {} - for split in self.metadata.eval_splits: - hf_dataset = self.dataset[split] - _dataset[split] = [ - { - "sentence1": hf_dataset["description"], - "sentence2": hf_dataset["isomeric_smiles"], - "labels": hf_dataset["labels"], - } - ] - self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py deleted file mode 100644 index ba3da3b0c1..0000000000 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESIsoTitlePC.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class PubChemSMILESIsoTitlePC(AbsTaskPairClassification): - metadata = TaskMetadata( - name="PubChemSMILESIsoTitlePC", - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - dataset={ - "path": "BASF-AI/PubChemSMILESIsoTitlePC", - "revision": "1b0d57516ec7c168b8da44f80148b5418ba394b3" - }, - type="PairClassification", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="max_ap", - date=None, - domains=["Chemistry"], - task_subtypes=None, - license="cc-by-nc-sa-4.0", - annotations_creators="derived", - dialect=None, - sample_creation=None, - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - _dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - - self.dataset = _dataset - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" - ) - - _dataset = {} - for split in self.metadata.eval_splits: - hf_dataset = self.dataset[split] - _dataset[split] = [ - { - "sentence1": hf_dataset["title"], - "sentence2": hf_dataset["isomeric_smiles"], - "labels": hf_dataset["labels"], - } - ] - self.dataset = _dataset From c68d6968b83f9b963087e7720d7e29622a720982 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 00:05:16 +0000 Subject: [PATCH 35/49] Update some ChemTEB tasks - Move `PubChemSMILESBitextMining` to `eng` folder - Add citations for tasks involving SDS, NQ, Hotpot, PubChem data - Update Clustering tasks `category` - Change `main_score` for `PubChemAISentenceParaphrasePC` --- mteb/tasks/BitextMining/__init__.py | 2 +- .../PubChemSMILESBitextMining.py | 10 ++++++++ mteb/tasks/BitextMining/eng/__init__.py | 0 .../eng/SDSEyeProtectionClassification.py | 7 ++++++ .../eng/SDSGlovesClassification.py | 7 ++++++ ...WikipediaChemistrySpecialtiesClustering.py | 2 +- .../eng/WikipediaChemistryTopicsClustering.py | 2 +- .../eng/PubChemAISentenceParaphrasePC.py | 12 ++++++++- .../PairClassification/eng/PubChemSMILESPC.py | 10 ++++++++ .../eng/PubChemSynonymPC.py | 10 ++++++++ .../eng/PubChemWikiParagraphsPC.py | 11 ++++++++ .../eng/WikipediaAIParagraphsParaphrasePC.py | 4 +-- .../PubChemWikiPairClassification.py | 10 ++++++++ .../Retrieval/eng/ChemHotpotQARetrieval.py | 25 ++++++++++++++++++- mteb/tasks/Retrieval/eng/ChemNQRetrieval.py | 8 ++++++ 15 files changed, 113 insertions(+), 7 deletions(-) rename mteb/tasks/BitextMining/{multilingual => eng}/PubChemSMILESBitextMining.py (85%) create mode 100644 mteb/tasks/BitextMining/eng/__init__.py diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index a8addef6a6..1cec5d5ddc 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from .dan.BornholmskBitextMining import * +from .eng.PubChemSMILESBitextMining import * from .kat.TbilisiCityHallBitextMining import * from .multilingual.BibleNLPBitextMining import * from .multilingual.BUCCBitextMining import * @@ -18,7 +19,6 @@ from .multilingual.NusaTranslationBitextMining import * from .multilingual.NusaXBitextMining import * from .multilingual.PhincBitextMining import * -from .multilingual.PubChemSMILESBitextMining import * from .multilingual.RomaTalesBitextMining import * from .multilingual.TatoebaBitextMining import * from .srn.SRNCorpusBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py similarity index 85% rename from mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py rename to mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py index 261c89f3b2..4e1d62ee36 100644 --- a/mteb/tasks/BitextMining/multilingual/PubChemSMILESBitextMining.py +++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py @@ -50,6 +50,16 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining, MultilingualTask): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } """, ) diff --git a/mteb/tasks/BitextMining/eng/__init__.py b/mteb/tasks/BitextMining/eng/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py index 5adaba8c7e..cb442d22bf 100644 --- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -33,5 +33,12 @@ class SDSEyeProtectionClassification(AbsTaskClassification): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @inproceedings{pereira2020msds, + title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + author={Pereira, Eliseu}, + booktitle={15th Doctoral Symposium}, + pages={42}, + year={2020} + } """, ) diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py index 6f3d339bec..f5055bd7ab 100644 --- a/mteb/tasks/Classification/eng/SDSGlovesClassification.py +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -33,5 +33,12 @@ class SDSGlovesClassification(AbsTaskClassification): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @inproceedings{pereira2020msds, + title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + author={Pereira, Eliseu}, + booktitle={15th Doctoral Symposium}, + pages={42}, + year={2020} + } """, ) diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py index 4c6007902d..3385e3b8b5 100644 --- a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py @@ -15,7 +15,7 @@ class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): "revision": "7754d8d296f9f4c3af1c6426fab36304730ccddf", }, type="Clustering", - category="p2p", + category="s2p", modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py index bd66ccfcd1..ec211378bd 100644 --- a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py @@ -15,7 +15,7 @@ class WikipediaChemistryTopicsClustering(AbsTaskClustering): "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", }, type="Clustering", - category="p2p", + category="s2p", modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py index 32c3447301..fe1a6b56bc 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -20,7 +20,7 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="max_f1", + main_score="max_ap", date=None, domains=["Chemistry"], task_subtypes=None, @@ -35,6 +35,16 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } """, ) diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py index 6c63b1da7d..78b1ae665b 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -61,6 +61,16 @@ class PubChemSMILESPC(AbsTaskPairClassification): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } """, ) diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py index 09c6b8dae7..18dba0eabc 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -35,6 +35,16 @@ class PubChemSynonymPC(AbsTaskPairClassification): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } """, ) diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py index f3f2a07e8c..99c0097280 100644 --- a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -35,7 +35,18 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } """, + ) def load_data(self): diff --git a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py index 7875fd0fe0..5701db915c 100644 --- a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py @@ -12,8 +12,8 @@ class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaAIParagraphsParaphrasePC", - "revision": "a430437ea6c6fe0e6461e6d6659f647d0bf62496" + "path": "BASF-AI/WikipediaAIParagraphsParaphrasePC", + "revision": "7694661b6e28000d9b2c2376a1bbd49417d279ea" }, type="PairClassification", category="p2p", diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py index c3bb5dd5c7..29b4895f83 100644 --- a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -49,6 +49,16 @@ class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask) journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } """, ) diff --git a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py index 5585373ac3..ce9d222058 100644 --- a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py @@ -33,5 +33,28 @@ class ChemHotpotQARetrieval(AbsTaskRetrieval): journal={arXiv preprint arXiv:2412.00532}, year={2024} } - """, + @inproceedings{yang-etal-2018-hotpotqa, + title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", + author = "Yang, Zhilin and + Qi, Peng and + Zhang, Saizheng and + Bengio, Yoshua and + Cohen, William and + Salakhutdinov, Ruslan and + Manning, Christopher D.", + editor = "Riloff, Ellen and + Chiang, David and + Hockenmaier, Julia and + Tsujii, Jun{'}ichi", + booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", + month = oct # "-" # nov, + year = "2018", + address = "Brussels, Belgium", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/D18-1259", + doi = "10.18653/v1/D18-1259", + pages = "2369--2380", + abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", + } +""", ) diff --git a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py index 2342bcaa79..a0bfb9c1fa 100644 --- a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py @@ -33,5 +33,13 @@ class ChemNQRetrieval(AbsTaskRetrieval): journal={arXiv preprint arXiv:2412.00532}, year={2024} } + @article{47761, + title = {Natural Questions: a Benchmark for Question Answering Research}, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh + and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee + and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le + and Slav Petrov}, + year = {2019}, + journal = {Transactions of the Association of Computational Linguistics}} """, ) From 20637f6dd8a27653569aa84fc1c60b7cb101dbca Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 00:13:03 +0000 Subject: [PATCH 36/49] Create ChemTEB benchmark --- mteb/benchmarks/benchmarks.py | 45 +++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index edb4326cae..5ec3e09635 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1037,3 +1037,48 @@ def load_results( reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", citation=None, ) + +CHEMTEB = Benchmark( + name="ChemTEB", + tasks=get_tasks( + tasks=[ + "PubChemSMILESBitextMining", + "SDSEyeProtectionClassification", + "SDSGlovesClassification", + "WikipediaBioMetChemClassification", + "WikipediaGreenhouseEnantiopureClassification", + "WikipediaSolidStateColloidalClassification", + "WikipediaOrganicInorganicClassification", + "WikipediaCryobiologySeparationClassification", + "WikipediaChemistryTopicsClassification", + "WikipediaTheoreticalAppliedClassification", + "WikipediaChemFieldsClassification", + "WikipediaLuminescenceClassification", + "WikipediaIsotopesFissionClassification", + "WikipediaSaltsSemiconductorsClassification", + "WikipediaBiolumNeurochemClassification", + "WikipediaCrystallographyAnalyticalClassification", + "WikipediaCompChemSpectroscopyClassification", + "WikipediaChemEngSpecialtiesClassification", + "WikipediaChemistryTopicsClustering", + "WikipediaSpecialtiesInChemistryClustering", + "PubChemAISentenceParaphrasePC", + "PubChemSMILESPC", + "PubChemSynonymPC", + "PubChemWikiParagraphsPC", + "WikipediaAIParagraphsParaphrasePC", + "PubChemWikiPairClassification", + "ChemNQRetrieval", + "ChemHotpotQARetrieval", + ], + ), + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} +}""", +) From 5c04d06bb018394b9293a304dbd9f85985b1f4eb Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 00:13:56 +0000 Subject: [PATCH 37/49] Remove `CoconutRetrieval` --- mteb/tasks/Retrieval/__init__.py | 1 - mteb/tasks/Retrieval/eng/CoconutRetrieval.py | 36 -------------------- 2 files changed, 37 deletions(-) delete mode 100644 mteb/tasks/Retrieval/eng/CoconutRetrieval.py diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index e62f285fab..f1e549eace 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -32,7 +32,6 @@ from .eng.ChemNQRetrieval import * from .eng.ChemHotpotQARetrieval import * from .eng.ClimateFEVERRetrieval import * -from .eng.CoconutRetrieval import * from .eng.CQADupstackAndroidRetrieval import * from .eng.CQADupstackEnglishRetrieval import * from .eng.CQADupstackGamingRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py b/mteb/tasks/Retrieval/eng/CoconutRetrieval.py deleted file mode 100644 index 6382955ea7..0000000000 --- a/mteb/tasks/Retrieval/eng/CoconutRetrieval.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import annotations -import logging - -from mteb.abstasks.TaskMetadata import TaskMetadata - - -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval - -logger = logging.getLogger(__name__) - - -class CoconutRetrieval(AbsTaskRetrieval): - metadata = TaskMetadata( - name="CoconutRetrieval", - dataset={ - "path": "BASF-We-Create-Chemistry/SmallCoconutRetrieval", - "revision": "831d292c3959eae59e4f89b8758738feee97d6cf", - }, - description="COCONUT: the COlleCtion of Open NatUral producTs", - reference="https://coconut.naturalproducts.net/", - type="Retrieval", - category="s2s", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, - bibtex_citation="""""", - descriptive_stats={} - ) From c9470120c0db88ee1ad096a7ecba678969817452 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 00:14:27 +0000 Subject: [PATCH 38/49] Update tasks and benchmarks tables with ChemTEB --- docs/benchmarks.md | 35 ++++++++++++++++++---------- docs/tasks.md | 57 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a5abe50215..15051f0fe3 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -7,16 +7,27 @@ The following table gives you an overview of the benchmarks in MTEB. | Name | # Tasks | Task Types | Domains | Languages | |------|---------|------------|---------|-----------| -| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java | -| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid | -| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | -| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno | -| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java | -| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra | -| MTEB(eng) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, Blog, News, Medical, Social, Programming, Written, Reviews, Web, Academic] | tur,fra,eng,cmn,pol,ita,nld,spa,deu,ara | -| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, News, Social, Reviews, Written, Web, Legal, Academic] | eng,deu,pol,fra | -| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | kor | -| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | eng,deu,zho | -| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Spoken, Non-fiction, News, Fiction, Social, Written, Web, Legal, Academic] | pol,deu,eng,fra | -| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Encyclopaedic, Spoken, Blog, News, Social, Reviews, Written, Web, Academic] | rus | +| [BRIGHT](https://brightbenchmark.github.io/) | 1 | {'Retrieval': 1} | [Non-fiction] | eng | +| [ChemTEB](https://arxiv.org/abs/2412.00532) | 28 | {'BitextMining': 1, 'Classification': 17, 'Clustering': 2, 'PairClassification': 6, 'Retrieval': 2} | [Chemistry] | msa,zho,jpn,fra,kor,en,nld,por,hin,tur,deu,eng,spa,ces | +| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | c++,javascript,java,ruby,go,python,sql,eng,php | +| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | 6 | {'Retrieval': 6} | [Written, Academic, Non-fiction, Encyclopaedic, Fiction, Spoken, Blog] | eng | +| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Reviews, Written, Social] | tgl,arz,tha,cym,srp,ita,swh,sun,hrv,ace,bul,jpn,mak,tam,xho,yue,ind,dtp,mui,ban,war,epo,hau,fra,ber,tzl,urd,hun,swe,heb,aze,nds,lit,por,tuk,bel,kor,spa,est,bos,gla,hye,kur,cbk,ces,kaz,eus,uzb,max,kat,arq,nij,tur,eng,bre,bug,nld,mad,glg,ido,bbc,nob,sqi,pol,amh,ang,awa,dan,cat,bjn,slv,tel,ast,ibo,bew,lat,zsm,ceb,ara,isl,yor,fry,slk,wuu,bhp,hin,fin,oci,vie,mal,pam,pms,mar,nov,mkd,lfn,pcm,ukr,mhr,afr,dsb,min,cmn,fao,ben,lvs,deu,orv,gle,kab,cor,jav,ile,ell,nno,uig,cha,abs,hsb,khm,csb,pes,ina,kzj,rej,gsw,tat,mon,ron,swg,yid,rus | +| MTEB(Europe, beta) | 74 | {'BitextMining': 7, 'Classification': 21, 'Clustering': 8, 'Retrieval': 15, 'InstructionRetrieval': 3, 'MultilabelClassification': 2, 'PairClassification': 6, 'Reranking': 3, 'STS': 9} | [News, Spoken, Written, Academic, Religious, Non-fiction, Encyclopaedic, Reviews, Constructed, Government, Social, Medical, Programming, Blog, Legal, Subtitles, Web, Fiction] | cwe,gub,poe,nbq,mlt,nhi,mek,szl,cak,rom,sot,stp,taw,kiz,sna,kgk,okv,kqw,aoi,gnw,not,cle,txq,vec,ian,kyg,cbi,pab,car,luo,mzz,srm,jae,caa,fuf,xla,nuy,zho,cot,bak,tbc,mai,quc,kmo,cap,kwd,mya,heb,nab,aze,ikw,ajp,nop,nys,rop,tee,dji,bqp,jid,hne,spa,nhe,pio,qve,kkl,csy,met,mpm,cbc,azb,cnl,chk,abx,aia,geb,atg,nak,cso,amr,nnq,yml,cjk,wmt,kbq,mmo,glk,bsp,kaz,ton,zlm,cop,naf,upv,sim,dwr,tlf,cav,aai,tur,wrk,bsn,jvn,gng,mpt,hns,cbt,kde,dov,mle,mwf,bug,zga,kam,bba,nld,rai,ltz,glg,opm,sat,awb,boa,gmv,wsk,hch,mcf,pol,tiw,uzn,tsw,hbo,ngu,maq,mcq,bjz,jao,tel,ast,zpc,rkb,row,lua,mxt,heg,lid,srd,pap,ksd,maa,lbb,tvk,zar,als,ote,huu,waj,bkx,zao,qvw,bki,gdr,apz,ata,hmo,dgr,atd,kmk,acm,nhw,wln,myy,nii,yss,crh,hop,sps,nso,shj,ffm,mux,mgc,tof,maj,tzj,nor,bao,apn,qwh,azj,nou,tew,kkc,acq,gah,aom,kpg,qxn,aaz,sny,tbo,mar,nwi,orm,mos,quy,cco,kql,ebk,xav,zpz,min,cmn,zaa,imo,klv,kbc,zas,suz,amu,atb,knf,obo,san,agm,gyr,amm,tmd,zpl,gui,toc,caf,aui,kmu,nno,ter,azg,cha,tbz,aoj,pls,cpy,kwi,hix,ewe,mox,ubr,seh,anv,hlt,cab,nlg,rgu,bpr,kpj,tat,mbt,nko,jiv,etr,kdc,nho,rug,uvl,sue,ktm,urw,ssg,guj,mti,mcb,far,spm,bzh,tgl,cbs,sgz,aso,bsj,taj,bjk,mic,noa,leu,bon,chf,qvc,rmc,jpn,cpc,blz,kjs,kmr,cux,pwg,xho,yue,kwf,mee,qul,mpp,jic,tdt,agt,iou,kto,soq,ken,gwi,kon,hui,cax,mop,kyz,ncj,trc,viv,wiu,ziw,buk,cui,ots,pon,hun,swe,por,ood,run,fuh,kbp,mir,wuv,grc,ssx,ltg,ino,ntu,wal,amk,fon,llg,zap,crx,kpr,mil,est,fuc,kmg,kin,mdy,ixl,fas,agn,zam,bqc,gvc,xtm,sri,bos,sbk,nya,con,cme,gla,hye,kur,pad,bmh,kqa,xsi,knv,kat,gvs,ulk,wer,cbv,kas,mbh,lav,mau,bea,eng,nhy,otn,bre,yre,dad,djk,kqf,sus,bch,uli,kmb,qup,djr,mav,pbt,ndg,mlp,snc,kyf,nqo,mph,zpq,kup,nsn,amh,tac,aau,isn,xed,yon,zia,fuv,ncu,slv,kne,agd,cjv,swp,zaw,ibo,tna,glv,ikk,aey,cof,ame,wro,zsm,haw,sbe,guh,apr,ubu,ttc,sua,mie,sgb,nhr,for,ppo,msa,tav,pjt,zai,cni,mbj,quh,fue,top,wrs,inb,cmo,qxh,ztq,myw,apw,wim,vmy,dgz,cgc,wbp,afr,kze,lif,aly,aii,bss,ben,udu,bgs,kan,med,lvs,bco,mna,tir,too,hat,gle,ydd,sbs,nss,cbu,jni,yap,dif,kab,bjv,som,fil,uig,avt,wnc,zsr,ntp,tso,mkn,txu,tpi,toj,muy,yaq,tnn,enq,agg,spl,gum,srq,zpo,mig,ycn,box,iws,arn,chq,bhg,ron,dik,mcr,ary,bef,rus,tbf,bmr,cya,kgp,srp,rro,dhg,ory,tgk,ita,cym,swh,sun,aka,hrv,pri,sxb,ace,mlg,cpu,bem,kek,apu,bhl,mbc,tgp,tos,gvn,aby,faa,ded,apc,yad,kud,zpu,fij,war,qvn,nde,aak,mib,cjo,scn,hau,tte,fra,tnp,pah,sin,usp,ksr,mxb,tku,plu,lit,lim,pao,bel,arp,huv,nas,guo,kor,awk,bvr,xtd,cpb,acu,mbb,qvm,twi,dyu,tsn,kpx,beu,daa,arl,blw,msy,tcz,yle,swa,tum,tke,gaw,ces,aer,nif,zos,eus,npl,zat,kvn,are,lug,tca,maz,kaq,bnp,tzm,omw,wap,tif,gai,yal,bbb,gvf,spp,gnn,kew,piu,snx,tgo,mqb,lac,ndj,nhu,tpa,yrb,nob,sqi,amp,pus,boj,msc,dah,kje,awa,cat,kir,bjn,kyc,ilo,ctp,bdd,mxq,beo,mks,ven,dob,lat,pma,mri,cek,wbi,npi,nvm,wmw,ara,nch,shn,mqj,mey,shi,adz,nhg,knj,hin,msb,zac,mxp,fur,sag,kwj,oci,mco,chz,ura,nep,cpa,kqc,bbr,lww,lbk,nin,poy,eri,qvs,snd,bus,lex,lao,khk,zpm,tuc,bjp,mgh,ntj,mkd,bho,kpf,bps,rwo,sja,ssd,zca,zpv,klt,tod,dwy,gof,bod,kik,grn,mva,kea,ake,ons,acr,mbl,gul,mpx,qxo,crn,wos,yva,dzo,khs,mni,ayr,tpt,nfa,soy,zav,yuw,msk,tfr,khm,pes,mto,mio,sco,smk,agr,dop,tcs,apb,kpw,eko,emi,arb,pir,mgw,mwp,lij,emp,qvh,zab,cub,gaz,arz,otm,tha,pib,qvz,agu,ptu,gfk,hla,sll,tzo,snp,bul,gdn,mkl,tam,ind,cut,hvn,mbs,azz,lmo,ban,cth,epo,bmu,jac,mwc,usa,div,ghs,urd,umb,ncl,uvh,myk,tuk,tuf,tpz,xbi,dgc,zyp,mmx,roo,amn,bzd,pag,kms,tim,wol,hto,ese,hub,lus,poi,auy,snn,poh,bkq,fai,mps,rmy,yby,urt,tah,mag,byr,mih,kac,mcd,cbk,mhl,ptp,esk,mkj,kdl,mlh,uzb,reg,srn,gup,alq,ign,mcp,tuo,hus,cta,lin,meq,xnn,gym,mca,ksj,kgf,cao,mpj,cac,tnk,bvd,mvn,lgl,prf,nus,prs,nyu,hmn,mam,dww,yut,dan,khz,sey,cbr,bxh,bam,kvg,byx,bkd,pan,yaa,bmk,ceb,plt,yka,hot,att,urb,cnt,isl,ckb,yor,slk,wiv,zty,fin,ong,tbg,gam,anh,vie,mal,kiw,sab,tiy,gux,ipi,asm,acf,amx,bjr,mwe,myu,gun,kmh,xon,amo,knc,tet,uri,kbm,msm,meu,aeb,chd,yuj,wnu,ukr,alp,clu,mit,fao,ssw,ngp,otq,shp,deu,kue,kbh,mjc,cuc,bzj,big,kos,jav,ctu,aon,bgt,ell,abt,miz,amf,kyq,wat,qub,zaj,auc,quf,nna,zul,nca,cuk,smo,spy,vid,zad,ape,mon,wed,taq,ars,tue,lcm,tnc,ruf,awx | +| MTEB(Indic, beta) | 23 | {'BitextMining': 4, 'Clustering': 1, 'Classification': 13, 'STS': 1, 'PairClassification': 1, 'Retrieval': 2, 'Reranking': 1} | [News, Written, Religious, Non-fiction, Reviews, Constructed, Government, Encyclopaedic, Social, Web, Legal, Spoken, Fiction] | tgl,arz,tha,cym,mlt,ory,srp,szl,ita,tgk,sot,sun,aka,hrv,sna,ace,swh,bul,jpn,bem,kmr,tam,xho,yue,ind,vec,apc,lmo,ban,fij,war,kon,luo,epo,gom,zho,scn,hau,fra,bak,sin,mai,urd,umb,hun,mya,swe,heb,por,lim,lit,tuk,run,ajp,bel,kbp,pag,ltg,kor,wol,lus,dyu,twi,fon,hne,spa,tsn,mup,est,kin,azb,mag,fas,bos,kac,swa,gla,nya,hye,tum,cjk,ces,kaz,eus,lug,kat,kas,tur,tzm,eng,bug,kmb,kam,lin,nld,ltz,glg,sat,pbt,nob,prs,nus,gaz,nqo,pol,uzn,amh,pus,fuv,awa,dan,cat,kir,bjn,slv,ilo,brx,tel,bam,ast,lua,ibo,srd,pap,pan,mri,rus,als,zsm,ceb,npi,plt,raj,ara,shn,bgc,isl,ckb,yor,slk,acm,hin,sag,fur,fin,oci,crh,nep,nso,vie,mal,nor,asm,snd,azj,acq,knc,lao,khk,mar,aeb,mkd,gbm,mos,ukr,quy,bho,afr,fao,min,cmn,ssw,ben,kan,lvs,san,bod,deu,kik,tir,grn,gle,boy,hat,kea,ydd,mwr,kab,jav,nno,ell,dzo,som,mni,ayr,uig,tso,tpi,khm,ewe,pes,zul,smo,tat,doi,ron,taq,arb,ars,dik,lij,ary,guj | +| MTEB(Medical) | 12 | {'Retrieval': 9, 'Clustering': 2, 'Reranking': 1} | [Written, Academic, Non-fiction, Medical, Web, Government] | zho,ara,cmn,fra,kor,pol,vie,rus,eng,spa | +| MTEB(Multilingual, beta) | 132 | {'BitextMining': 13, 'Classification': 43, 'Clustering': 17, 'Retrieval': 18, 'InstructionRetrieval': 3, 'MultilabelClassification': 5, 'PairClassification': 11, 'Reranking': 6, 'STS': 16} | [News, Spoken, Written, Academic, Religious, Non-fiction, Encyclopaedic, Reviews, Government, Constructed, Social, Medical, Programming, Blog, Legal, Subtitles, Web, Fiction] | cwe,gub,poe,nbq,mlt,nhi,mek,szl,cak,rom,sot,stp,taw,kiz,sna,kgk,okv,kqw,aoi,gnw,not,cle,txq,vec,ian,kyg,cbi,mui,pab,car,luo,mzz,srm,jae,caa,fuf,xla,nuy,zho,cot,gom,bak,tbc,mai,quc,kmo,cap,kwd,mya,heb,nab,aze,ikw,ajp,nop,nys,rop,tee,dji,bqp,jid,hne,spa,nhe,pio,qve,kkl,csy,met,mpm,cbc,azb,cnl,chk,abx,aia,geb,atg,nak,cso,amr,nnq,yml,cjk,wmt,kbq,mmo,glk,bsp,kaz,ton,zlm,cop,naf,upv,sim,arq,dwr,tlf,cav,aai,tur,wrk,bsn,jvn,gng,mpt,hns,cbt,kde,dov,mle,mwf,bug,zga,kam,bba,nld,rai,ltz,glg,opm,sat,awb,boa,gmv,wsk,hch,mcf,pol,tiw,uzn,tsw,hbo,ngu,maq,brx,mcq,bjz,jao,tel,ast,zpc,rkb,row,lua,mxt,heg,bew,lid,srd,pap,ksd,maa,lbb,tvk,zar,als,ote,huu,waj,bkx,zao,qvw,bki,gdr,apz,ata,hmo,tyv,dgr,atd,kmk,acm,nhw,wln,myy,nii,yss,crh,hop,sps,nso,shj,ffm,mux,mgc,tof,maj,tzj,nor,bao,apn,qwh,azj,nou,tew,kkc,acq,gah,aom,kpg,qxn,aaz,sny,tbo,mar,nwi,orm,mos,quy,cco,kql,ebk,xav,mhr,zpz,min,cmn,zaa,imo,klv,kbc,zas,suz,amu,atb,knf,obo,san,agm,gyr,amm,tmd,zpl,gui,toc,caf,cor,aui,kmu,nno,ile,ter,azg,cha,tbz,aoj,hsb,pls,cpy,kwi,hix,ewe,mox,ubr,seh,anv,sah,hlt,cab,nlg,rgu,bpr,kpj,tat,mbt,nko,jiv,etr,kdc,nho,rug,uvl,sue,ktm,urw,ssg,yid,guj,mti,mcb,far,spm,bzh,tgl,cbs,sgz,aso,bsj,taj,bjk,mic,noa,leu,bon,chf,qvc,rmc,jpn,cpc,blz,kjs,kmr,cux,pwg,xho,yue,kwf,mee,qul,dtp,mpp,jic,tdt,agt,iou,kto,soq,ken,gwi,kon,hui,cax,mop,kyz,ncj,trc,viv,wiu,ziw,buk,cui,ots,pon,hun,swe,por,ood,run,fuh,kbp,mir,wuv,grc,ssx,ltg,chv,ino,ntu,wal,amk,fon,llg,zap,crx,kpr,mil,est,fuc,kmg,kin,mdy,ixl,fas,agn,zam,bqc,gvc,xtm,sri,bos,sbk,nya,con,cme,gla,hye,kur,pad,bmh,kqa,xsi,knv,kat,gvs,ulk,wer,cbv,kas,mbh,lav,mau,bea,eng,nhy,otn,bre,yre,dad,djk,kqf,sus,bch,uli,kmb,qup,djr,mav,pbt,ndg,mlp,snc,kyf,nqo,mph,zpq,kup,nsn,amh,tac,aau,isn,xed,yon,zia,fuv,ncu,slv,kne,agd,cjv,swp,zaw,ibo,tna,glv,ikk,aey,cof,ame,wro,zsm,haw,sbe,guh,apr,ubu,ttc,sua,mie,sgb,nhr,for,ppo,msa,tav,pjt,zai,cni,mbj,quh,fue,top,wrs,inb,cmo,qxh,ztq,myw,apw,wim,vmy,dgz,cgc,pcm,wbp,afr,kze,lif,aly,aii,bss,ben,udu,bgs,svk,kan,med,lvs,bco,mna,tir,too,hat,gle,ydd,sbs,nss,cbu,jni,yap,dif,kab,bjv,som,fil,uig,avt,wnc,zsr,ntp,tso,mkn,txu,tpi,toj,muy,yaq,tnn,enq,agg,spl,gum,srq,zpo,mig,kzj,ycn,box,iws,arn,chq,bhg,ron,dik,mcr,ary,bef,rus,tbf,bmr,cya,kgp,srp,rro,dhg,ory,tgk,ita,cym,swh,sun,aka,hrv,pri,sxb,ace,mlg,cpu,bem,kek,apu,bhl,mbc,tgp,tos,gvn,aby,faa,ded,apc,yad,kud,zpu,fij,war,qvn,nde,aak,mib,cjo,scn,hau,tte,fra,tnp,pah,sin,tzl,usp,ksr,mxb,tku,plu,lit,lim,pao,nds,bel,arp,huv,nas,guo,kor,awk,bvr,xtd,cpb,acu,mbb,qvm,twi,dyu,tsn,kpx,beu,daa,arl,blw,msy,tcz,yle,swa,tum,tke,gaw,ces,aer,nif,zos,eus,npl,zat,kvn,are,lug,tca,maz,kaq,bnp,tzm,omw,wap,tif,gai,yal,bbb,gvf,mad,spp,gnn,kew,piu,snx,tgo,mqb,ido,bbc,lac,ndj,nhu,tpa,yrb,nob,sqi,amp,pus,boj,msc,dah,kje,awa,cat,kir,bjn,kyc,ilo,ctp,bdd,mxq,beo,mks,ven,dob,lat,pma,mri,cek,wbi,npi,raj,nvm,wmw,ara,nch,shn,mqj,mey,shi,adz,nhg,knj,bhp,hin,msb,zac,mxp,fur,sag,kwj,oci,mco,chz,ura,nep,cpa,kqc,bbr,lww,lbk,nin,poy,eri,qvs,snd,bus,pms,lex,lao,khk,zpm,nov,tuc,bjp,mgh,ntj,mkd,lfn,bho,dsb,kpf,bps,rwo,sja,ssd,zca,zpv,klt,tod,dwy,gof,bod,kik,orv,grn,mva,kea,ake,mwr,ons,acr,mbl,gul,mpx,krc,qxo,crn,wos,yva,dzo,khs,mni,ayr,tpt,nfa,soy,zav,yuw,abs,msk,tfr,khm,pes,ina,mto,mio,sco,smk,agr,dop,tcs,apb,rej,kpw,gsw,doi,eko,emi,arb,pir,mgw,mwp,lij,emp,qvh,zab,cub,gaz,arz,otm,tha,pib,qvz,agu,ptu,gfk,hla,sll,tzo,snp,bul,gdn,mkl,tam,mak,ind,cut,hvn,mbs,azz,lmo,ban,cth,epo,bmu,jac,mwc,usa,div,ber,ghs,urd,umb,ncl,uvh,myk,tuk,tuf,tpz,xbi,dgc,zyp,mmx,roo,amn,bzd,pag,kms,tim,wol,hto,ese,hub,lus,poi,auy,snn,poh,bkq,fai,mps,rmy,yby,mup,urt,tah,mag,byr,mih,kac,mcd,cbk,mhl,ptp,esk,mkj,kdl,mlh,uzb,max,reg,srn,gup,alq,nij,ign,mcp,tuo,hus,cta,lin,meq,xnn,gym,mca,ksj,kgf,cao,mpj,cac,tnk,bvd,mvn,lgl,prf,nus,prs,nyu,hmn,mam,dww,ang,yut,dan,khz,sey,cbr,bxh,bam,kvg,byx,bkd,pan,yaa,bmk,ceb,plt,yka,hot,att,urb,cnt,bgc,isl,ckb,yor,fry,slk,wuu,wiv,zty,fin,ong,tbg,gam,anh,vie,mal,pam,kiw,sab,tiy,gux,ipi,asm,acf,amx,bjr,mwe,myu,gun,kmh,xon,amo,knc,tet,uri,kbm,msm,meu,aeb,chd,yuj,gbm,wnu,ukr,alp,clu,mit,fao,ssw,ngp,otq,shp,deu,kue,kbh,boy,mjc,cuc,bzj,big,kos,jav,ctu,aon,bgt,ell,abt,miz,amf,kyq,wat,qub,zaj,auc,quf,csb,nna,zul,nca,cuk,smo,spy,vid,zad,ape,mon,wed,taq,ars,swg,tue,lcm,tnc,ruf,awx | +| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [News, Written] | eng | +| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [News, Spoken, Written, Non-fiction, Encyclopaedic, Reviews, Government, Social, Blog, Legal, Web, Fiction] | nob,fao,isl,dan,swe,nno | +| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | c++,javascript,typescript,java,ruby,swift,go,scala,shell,python,sql,rust,c,eng,php | +| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [News, Written, Reviews, Encyclopaedic, Web, Spoken] | pol,deu,eng,fra | +| MTEB(eng, beta) | 41 | {'Classification': 8, 'Retrieval': 10, 'Clustering': 8, 'Reranking': 2, 'STS': 9, 'PairClassification': 3, 'Summarization': 1} | [Spoken, News, Written, Academic, Non-fiction, Reviews, Medical, Encyclopaedic, Social, Programming, Web, Blog] | ara,cmn,fra,pol,ita,nld,tur,deu,eng,spa | +| MTEB(eng, classic) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Spoken, News, Written, Academic, Non-fiction, Reviews, Medical, Encyclopaedic, Social, Programming, Web, Blog] | ara,cmn,fra,pol,ita,nld,tur,deu,eng,spa | +| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [News, Written, Academic, Non-fiction, Reviews, Encyclopaedic, Social, Web, Legal, Spoken] | pol,deu,eng,fra | +| [MTEB(jpn)](https://github.com/sbintuitions/JMTEB) | 16 | {'Clustering': 2, 'Classification': 4, 'STS': 2, 'PairClassification': 1, 'Retrieval': 6, 'Reranking': 1} | [News, Written, Academic, Non-fiction, Reviews, Encyclopaedic, Web, Spoken] | jpn | +| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [News, Written, Reviews, Encyclopaedic, Web, Spoken] | kor | +| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Legal, Written] | deu,eng,zho | +| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [News, Written, Academic, Non-fiction, Social, Web, Legal, Spoken, Fiction] | pol,deu,eng,fra | +| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [News, Spoken, Written, Academic, Reviews, Encyclopaedic, Social, Web, Blog] | rus | +| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | 13 | {'Retrieval': 13} | [News, Written, Academic, Non-fiction, Medical, Encyclopaedic, Social, Web] | eng | +| [RAR-b](https://arxiv.org/abs/2404.06347) | 17 | {'Retrieval': 17} | [Encyclopaedic, Written, Programming] | eng | \ No newline at end of file diff --git a/docs/tasks.md b/docs/tasks.md index f4ec3c792e..459c03e033 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -122,6 +122,8 @@ The following tables give you an overview of the tasks in MTEB. | [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Social, Government, Written] | None | None | +| [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | +| [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | | [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | @@ -425,6 +427,12 @@ The following tables give you an overview of the tasks in MTEB. | [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | | [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | | [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, Non-fiction, Web, Written, Spoken, Social, News] | None | None | +| [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['en', 'eng'] | BitextMining | s2s | [Chemistry] | None | None | +| [PubChemSMILESPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemSynonymPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemWikiPairClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['ces', 'deu', 'eng', 'fra', 'hin', 'jpn', 'kor', 'msa', 'nld', 'por', 'spa', 'tur', 'zho'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemWikiParagraphsPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | | [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Medical, Government, Web, Written] | None | None | | [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | @@ -468,6 +476,8 @@ The following tables give you an overview of the tasks in MTEB. | [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | | [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [SDSEyeProtectionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | +| [SDSGlovesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | | [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | None | None | | [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | None | None | | [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | @@ -595,8 +605,26 @@ The following tables give you an overview of the tasks in MTEB. | [WebLINXCandidatesReranking](https://mcgill-nlp.github.io/weblinx) (Xing Han Lù, 2024) | ['eng'] | Reranking | p2p | [Academic, Web, Written] | None | None | | [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | +| [WikipediaAIParagraphsParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | +| [WikipediaBioMetChemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaBiolumNeurochemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemEngSpecialtiesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemFieldsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemistryTopicsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemistryTopicsClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | +| [WikipediaCompChemSpectroscopyClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCryobiologySeparationClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCrystallographyAnalyticalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaGreenhouseEnantiopureClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaIsotopesFissionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaLuminescenceClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaOrganicInorganicClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaRerankingMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-reranking-multilingual) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Reranking | s2p | [Encyclopaedic, Written] | {'test': 24000} | {'test': {'num_samples': 24000, 'number_of_characters': 83866932, 'num_positive': 24000, 'num_negative': 192000, 'min_query_length': 7, 'avg_query_length': 59.09, 'max_query_length': 180, 'unique_query': 23997, 'min_positive_length': 100, 'avg_positive_length': 385.45, 'max_positive_length': 3515, 'unique_positive': 23993, 'min_negative_length': 100, 'avg_negative_length': 381.24, 'max_negative_length': 9461, 'unique_negative': 191783, 'hf_subset_descriptive_stats': {'bg': {'num_samples': 1500, 'number_of_characters': 5145316, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 60.83, 'max_query_length': 166, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 375.89, 'max_positive_length': 2241, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 374.19, 'max_negative_length': 4869, 'unique_negative': 11996}, 'bn': {'num_samples': 1500, 'number_of_characters': 5390581, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 7, 'avg_query_length': 47.27, 'max_query_length': 123, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 394.59, 'max_positive_length': 2338, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 393.98, 'max_negative_length': 5104, 'unique_negative': 11996}, 'cs': {'num_samples': 1500, 'number_of_characters': 5079180, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 56.27, 'max_query_length': 137, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 383.84, 'max_positive_length': 2300, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 368.25, 'max_negative_length': 3487, 'unique_negative': 11982}, 'da': {'num_samples': 1500, 'number_of_characters': 4746132, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 56.75, 'max_query_length': 137, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 351.68, 'max_positive_length': 2159, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 344.46, 'max_negative_length': 2563, 'unique_negative': 11972}, 'de': {'num_samples': 1500, 'number_of_characters': 5483592, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 20, 'avg_query_length': 70.0, 'max_query_length': 180, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 391.54, 'max_positive_length': 2674, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 399.27, 'max_negative_length': 3083, 'unique_negative': 12000}, 'en': {'num_samples': 1500, 'number_of_characters': 6217884, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 68.37, 'max_query_length': 162, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 451.73, 'max_positive_length': 3515, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 453.14, 'max_negative_length': 3662, 'unique_negative': 12000}, 'fa': {'num_samples': 1500, 'number_of_characters': 4732619, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 12, 'avg_query_length': 48.67, 'max_query_length': 119, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 347.7, 'max_positive_length': 2571, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 344.84, 'max_negative_length': 4707, 'unique_negative': 11978}, 'fi': {'num_samples': 1500, 'number_of_characters': 5209132, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 55.34, 'max_query_length': 132, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 394.71, 'max_positive_length': 2129, 'unique_positive': 1498, 'min_negative_length': 100, 'avg_negative_length': 377.84, 'max_negative_length': 2574, 'unique_negative': 11972}, 'hi': {'num_samples': 1500, 'number_of_characters': 5620959, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 13, 'avg_query_length': 50.78, 'max_query_length': 125, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 420.38, 'max_positive_length': 2361, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 409.52, 'max_negative_length': 5912, 'unique_negative': 11996}, 'it': {'num_samples': 1500, 'number_of_characters': 5420496, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 23, 'avg_query_length': 70.05, 'max_query_length': 156, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 396.97, 'max_positive_length': 2082, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 393.33, 'max_negative_length': 9461, 'unique_negative': 11993}, 'nl': {'num_samples': 1500, 'number_of_characters': 5169556, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 65.34, 'max_query_length': 136, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 380.79, 'max_positive_length': 1864, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 375.03, 'max_negative_length': 3641, 'unique_negative': 11985}, 'pt': {'num_samples': 1500, 'number_of_characters': 5474356, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 65.12, 'max_query_length': 176, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 404.02, 'max_positive_length': 3057, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 397.55, 'max_negative_length': 2877, 'unique_negative': 11991}, 'ro': {'num_samples': 1500, 'number_of_characters': 4796113, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 61.97, 'max_query_length': 169, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 346.71, 'max_positive_length': 1917, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 348.59, 'max_negative_length': 4213, 'unique_negative': 11971}, 'sr': {'num_samples': 1500, 'number_of_characters': 5271732, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 15, 'avg_query_length': 55.67, 'max_query_length': 146, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 386.35, 'max_positive_length': 2421, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 384.06, 'max_negative_length': 3668, 'unique_negative': 11974}, 'no': {'num_samples': 1500, 'number_of_characters': 5036586, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 55.29, 'max_query_length': 129, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 367.72, 'max_positive_length': 1450, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 366.84, 'max_negative_length': 2841, 'unique_negative': 11996}, 'sv': {'num_samples': 1500, 'number_of_characters': 5072698, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 57.73, 'max_query_length': 133, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 372.59, 'max_positive_length': 2493, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 368.94, 'max_negative_length': 3680, 'unique_negative': 11999}}}} | | [WikipediaRetrievalMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [WikipediaSaltsSemiconductorsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaSolidStateColloidalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | +| [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | None | None | | XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | @@ -819,7 +847,7 @@ The following tables give you an overview of the tasks in MTEB. | cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | | cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 16 | +| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 17 | | cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | | chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -875,7 +903,7 @@ The following tables give you an overview of the tasks in MTEB. | dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 23 | | ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 6 | 2 | 18 | 4 | 0 | 0 | 58 | +| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 59 | | dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -903,7 +931,8 @@ The following tables give you an overview of the tasks in MTEB. | ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | | emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 16 | 143 | 16 | 3 | 1 | 8 | 8 | 105 | 13 | 2 | 1 | 316 | +| en | unknown | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 14 | 8 | 107 | 13 | 2 | 1 | 344 | | enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -924,7 +953,7 @@ The following tables give you an overview of the tasks in MTEB. | fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 19 | | fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 5 | 3 | 15 | 4 | 0 | 1 | 57 | +| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 58 | | fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -981,7 +1010,7 @@ The following tables give you an overview of the tasks in MTEB. | hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 11 | | heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 1 | 2 | 10 | 2 | 0 | 0 | 38 | +| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 39 | | hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1030,7 +1059,7 @@ The following tables give you an overview of the tasks in MTEB. | jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 1 | 3 | 13 | 2 | 0 | 0 | 35 | +| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 36 | | jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | | kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | @@ -1086,7 +1115,7 @@ The following tables give you an overview of the tasks in MTEB. | knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 2 | 1 | 9 | 3 | 0 | 0 | 29 | +| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 30 | | kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1231,7 +1260,7 @@ The following tables give you an overview of the tasks in MTEB. | mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | | msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1292,7 +1321,7 @@ The following tables give you an overview of the tasks in MTEB. | nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | | nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 0 | 1 | 2 | 2 | 0 | 0 | 19 | +| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 20 | | nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | @@ -1364,7 +1393,7 @@ The following tables give you an overview of the tasks in MTEB. | poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 46 | | pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 2 | 1 | 5 | 3 | 0 | 0 | 27 | +| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 28 | | poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1449,7 +1478,7 @@ The following tables give you an overview of the tasks in MTEB. | soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | | soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 2 | 2 | 13 | 4 | 0 | 0 | 43 | +| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 44 | | spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1546,7 +1575,7 @@ The following tables give you an overview of the tasks in MTEB. | tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 2 | 0 | 3 | 2 | 0 | 0 | 19 | +| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 20 | | tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1656,7 +1685,7 @@ The following tables give you an overview of the tasks in MTEB. | zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 1 | 1 | 13 | 0 | 0 | 0 | 20 | +| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 21 | | zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1675,7 +1704,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 51 | 473 | 85 | 2 | 2 | +| Total | None | None | None | 1396 | 812 | 306 | 3 | 28 | 85 | 51 | 475 | 85 | 2 | 2 | From add930ca9f93f18a103db6490120de9c046f573d Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 00:22:37 +0000 Subject: [PATCH 39/49] Mention ChemTEB in readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f556cad894..daf715f029 100644 --- a/README.md +++ b/README.md @@ -517,5 +517,6 @@ You may also want to read and cite the amazing work that has extended MTEB & int - Orion Weller, Benjamin Chang, Sean MacAvaney, Kyle Lo, Arman Cohan, Benjamin Van Durme, Dawn Lawrie, Luca Soldaini. "[FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions](https://arxiv.org/abs/2403.15246)" arXiv 2024 - Dawei Zhu, Liang Wang, Nan Yang, Yifan Song, Wenhao Wu, Furu Wei, Sujian Li. "[LongEmbed: Extending Embedding Models for Long Context Retrieval](https://arxiv.org/abs/2404.12096)" arXiv 2024 - Kenneth Enevoldsen, Márton Kardos, Niklas Muennighoff, Kristoffer Laigaard Nielbo. "[The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding](https://arxiv.org/abs/2406.02396)" arXiv 2024 +- Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee. "[ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain](https://arxiv.org/abs/2412.00532)" arXiv 2024 For works that have used MTEB for benchmarking, you can find them on the [leaderboard](https://huggingface.co/spaces/mteb/leaderboard). From ea75d395f71487fcf5f9149dc4bd513091c51cbe Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 02:45:59 +0000 Subject: [PATCH 40/49] Fix some issues, update task metadata, lint - `eval_langs` fixed - Dataset path was fixed for two datasets - Metadata was completed for all tasks, mainly following fields: `date`, `task_subtypes`, `dialect`, `sample_creation` - ruff lint - rename `nomic_bert_models.py` to `nomic_bert_model.py` and update it. --- mteb/models/amazon_models.py | 12 +-- mteb/models/cohere_bedrock_models.py | 40 ++++---- ...mic_bert_models.py => nomic_bert_model.py} | 95 +++++++++---------- mteb/models/overview.py | 8 +- .../eng/PubChemSMILESBitextMining.py | 25 ++--- mteb/tasks/Classification/__init__.py | 22 ++--- .../eng/SDSEyeProtectionClassification.py | 6 +- .../eng/SDSGlovesClassification.py | 6 +- .../eng/WikipediaBioMetChemClassification.py | 8 +- .../WikipediaBiolumNeurochemClassification.py | 6 +- ...kipediaChemEngSpecialtiesClassification.py | 8 +- .../eng/WikipediaChemFieldsClassification.py | 8 +- .../WikipediaChemistryTopicsClassification.py | 8 +- ...pediaCompChemSpectroscopyClassification.py | 8 +- ...ediaCryobiologySeparationClassification.py | 8 +- ...CrystallographyAnalyticalClassification.py | 8 +- ...ediaGreenhouseEnantiopureClassification.py | 8 +- .../WikipediaIsotopesFissionClassification.py | 8 +- .../WikipediaLuminescenceClassification.py | 8 +- ...WikipediaOrganicInorganicClassification.py | 8 +- ...ipediaSaltsSemiconductorsClassification.py | 10 +- ...ipediaSolidStateColloidalClassification.py | 8 +- ...kipediaTheoreticalAppliedClassification.py | 8 +- mteb/tasks/Clustering/__init__.py | 2 +- ...WikipediaChemistrySpecialtiesClustering.py | 9 +- .../eng/WikipediaChemistryTopicsClustering.py | 11 +-- .../eng/PubChemAISentenceParaphrasePC.py | 12 +-- .../PairClassification/eng/PubChemSMILESPC.py | 17 ++-- .../eng/PubChemSynonymPC.py | 10 +- .../eng/PubChemWikiParagraphsPC.py | 13 ++- .../eng/WikipediaAIParagraphsParaphrasePC.py | 12 +-- .../PubChemWikiPairClassification.py | 32 +++---- mteb/tasks/Retrieval/__init__.py | 2 +- .../Retrieval/eng/ChemHotpotQARetrieval.py | 10 +- mteb/tasks/Retrieval/eng/ChemNQRetrieval.py | 10 +- 35 files changed, 240 insertions(+), 234 deletions(-) rename mteb/models/{nomic_bert_models.py => nomic_bert_model.py} (61%) diff --git a/mteb/models/amazon_models.py b/mteb/models/amazon_models.py index c736ebd796..845ad7bc0c 100644 --- a/mteb/models/amazon_models.py +++ b/mteb/models/amazon_models.py @@ -1,14 +1,15 @@ from __future__ import annotations +import json import logging from functools import partial from typing import Any import numpy as np -import json from mteb.model_meta import ModelMeta from mteb.requires_package import requires_package + from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -18,6 +19,7 @@ class AmazonWrapper(Wrapper): def __init__(self, model_id: str, **kwargs) -> None: requires_package(self, "boto3", "Amazon Bedrock") import boto3 + boto3_session = boto3.session.Session() region_name = boto3_session.region_name self._client = boto3.client( @@ -33,12 +35,10 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: for sentence in sentences: response = self._client.invoke_model( - body=json.dumps({ - "inputText": sentence - }), + body=json.dumps({"inputText": sentence}), modelId=self._model_id, accept="application/json", - contentType="application/json" + contentType="application/json", ) all_embeddings.append(self._to_numpy(response)) @@ -46,7 +46,7 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: def _to_numpy(self, embedding_response) -> np.ndarray: response = json.loads(embedding_response.get("body").read()) - return np.array(response['embedding']) + return np.array(response["embedding"]) amazon_titan_embed_text_v1 = ModelMeta( diff --git a/mteb/models/cohere_bedrock_models.py b/mteb/models/cohere_bedrock_models.py index a3096c8eb3..6f37660e95 100644 --- a/mteb/models/cohere_bedrock_models.py +++ b/mteb/models/cohere_bedrock_models.py @@ -1,16 +1,17 @@ from __future__ import annotations +import json import logging from functools import partial from typing import Any import numpy as np -import json import tqdm -from mteb.model_meta import ModelMeta from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta from mteb.requires_package import requires_package + from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -131,13 +132,11 @@ class CohereBedrockWrapper(Wrapper): def __init__( - self, - model_id: str, - model_prompts: dict[str, str] | None = None, - **kwargs + self, model_id: str, model_prompts: dict[str, str] | None = None, **kwargs ) -> None: requires_package(self, "boto3", "Amazon Bedrock") import boto3 + boto3_session = boto3.session.Session() region_name = boto3_session.region_name self._client = boto3.client( @@ -158,7 +157,7 @@ def _embed( max_batch_size = 96 batches = [ - sentences[i: i + max_batch_size] + sentences[i : i + max_batch_size] for i in range(0, len(sentences), max_batch_size) ] @@ -166,12 +165,15 @@ def _embed( for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): response = self._client.invoke_model( - body=json.dumps({ - "texts": [sent[:2048] for sent in batch], - "input_type": cohere_task_type}), + body=json.dumps( + { + "texts": [sent[:2048] for sent in batch], + "input_type": cohere_task_type, + } + ), modelId=self._model_id, accept="*/*", - contentType="application/json" + contentType="application/json", ) all_embeddings.extend(self._to_numpy(response)) @@ -206,7 +208,7 @@ def encode( def _to_numpy(self, embedding_response) -> np.ndarray: response = json.loads(embedding_response.get("body").read()) - return np.array(response['embeddings']) + return np.array(response["embeddings"]) model_prompts = { @@ -218,8 +220,11 @@ def _to_numpy(self, embedding_response) -> np.ndarray: } cohere_embed_english_v3 = ModelMeta( - loader=partial(CohereBedrockWrapper, - model_id="cohere.embed-english-v3", model_prompts=model_prompts), + loader=partial( + CohereBedrockWrapper, + model_id="cohere.embed-english-v3", + model_prompts=model_prompts, + ), name="bedrock/cohere-embed-english-v3", languages=["eng-Latn"], open_weights=False, @@ -237,8 +242,11 @@ def _to_numpy(self, embedding_response) -> np.ndarray: ) cohere_embed_multilingual_v3 = ModelMeta( - loader=partial(CohereBedrockWrapper, - model_id="cohere.embed-multilingual-v3", model_prompts=model_prompts), + loader=partial( + CohereBedrockWrapper, + model_id="cohere.embed-multilingual-v3", + model_prompts=model_prompts, + ), name="cohere-embed-multilingual-v3", languages=supported_languages, open_weights=False, diff --git a/mteb/models/nomic_bert_models.py b/mteb/models/nomic_bert_model.py similarity index 61% rename from mteb/models/nomic_bert_models.py rename to mteb/models/nomic_bert_model.py index 10da788d7c..2210bb920c 100644 --- a/mteb/models/nomic_bert_models.py +++ b/mteb/models/nomic_bert_model.py @@ -1,31 +1,29 @@ from __future__ import annotations from functools import partial +from typing import Any +import torch +import torch.nn as nn from sentence_transformers import SentenceTransformer -from sentence_transformers.models import Transformer, Pooling +from sentence_transformers.models import Pooling, Transformer +from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer from mteb.model_meta import ModelMeta -from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM -from typing import Any, Dict, Optional - -import torch.nn as nn -import torch - class NomicBertTransformer(Transformer): def __init__( self, model_name_or_path: str, - max_seq_length: Optional[int] = None, - model_args: Optional[Dict[str, Any]] = None, - tokenizer_args: Optional[Dict[str, Any]] = None, - config_args: Optional[Dict[str, Any]] = None, - cache_dir: Optional[str] = None, + max_seq_length: int | None = None, + model_args: dict[str, Any] | None = None, + tokenizer_args: dict[str, Any] | None = None, + config_args: dict[str, Any] | None = None, + cache_dir: str | None = None, do_lower_case: bool = False, tokenizer_name_or_path: str = None, - revision: str = None + revision: str = None, ) -> None: nn.Module.__init__(self) self.config_keys = ["max_seq_length", "do_lower_case"] @@ -38,16 +36,23 @@ def __init__( config_args = {} config = AutoConfig.from_pretrained( - model_name_or_path, **config_args, cache_dir=cache_dir) + model_name_or_path, **config_args, cache_dir=cache_dir + ) self.auto_model = AutoModelForMaskedLM.from_pretrained( - model_name_or_path, config=config, revision=revision, cache_dir=cache_dir, **model_args + model_name_or_path, + config=config, + revision=revision, + cache_dir=cache_dir, + **model_args, ) self.auto_model.cls = nn.Identity() if max_seq_length is not None and "model_max_length" not in tokenizer_args: tokenizer_args["model_max_length"] = max_seq_length self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, + tokenizer_name_or_path + if tokenizer_name_or_path is not None + else model_name_or_path, cache_dir=cache_dir, **tokenizer_args, ) @@ -59,28 +64,38 @@ def __init__( and hasattr(self.tokenizer, "model_max_length") ): max_seq_length = min( - self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length) + self.auto_model.config.max_position_embeddings, + self.tokenizer.model_max_length, + ) self.max_seq_length = max_seq_length if tokenizer_name_or_path is not None: self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: """Returns token_embeddings, cls_token""" trans_features = { - "input_ids": features["input_ids"], "attention_mask": features["attention_mask"]} + "input_ids": features["input_ids"], + "attention_mask": features["attention_mask"], + } if "token_type_ids" in features: trans_features["token_type_ids"] = features["token_type_ids"] output_states = self.auto_model(**trans_features) output_tokens = output_states.logits - features.update({"token_embeddings": output_tokens, - "attention_mask": features["attention_mask"]}) + features.update( + { + "token_embeddings": output_tokens, + "attention_mask": features["attention_mask"], + } + ) if self.auto_model.config.output_hidden_states: all_layer_idx = 2 - if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states + if ( + len(output_states) < 3 + ): # Some models only output last_hidden_states and all_hidden_states all_layer_idx = 1 hidden_states = output_states[all_layer_idx] @@ -102,43 +117,24 @@ def nomic_bert_loader( ) -> SentenceTransformer: nomic_bert_transformer = NomicBertTransformer( model_name_or_path=model_name, - tokenizer_name_or_path='bert-base-uncased', - config_args={'trust_remote_code': True}, - model_args={'trust_remote_code': True}, - revision=revision + tokenizer_name_or_path="bert-base-uncased", + config_args={"trust_remote_code": True}, + model_args={"trust_remote_code": True}, + revision=revision, ) - pooling_model = Pooling( - nomic_bert_transformer.get_word_embedding_dimension()) - - return SentenceTransformerWithNormalization(modules=[nomic_bert_transformer, pooling_model]) - + pooling_model = Pooling(nomic_bert_transformer.get_word_embedding_dimension()) -def custom_nomic_bert_loader( - model_name: str, - tokenizer_name: str, - revision: str | None, **kwargs -) -> SentenceTransformer: - nomic_bert_transformer = NomicBertTransformer( - model_name_or_path=model_name, - tokenizer_name_or_path=tokenizer_name, - config_args={'trust_remote_code': True}, - model_args={'trust_remote_code': True, 'use_auth_token': True}, - tokenizer_args={'use_auth_token': True}, - revision=revision + return SentenceTransformerWithNormalization( + modules=[nomic_bert_transformer, pooling_model] ) - pooling_model = Pooling( - nomic_bert_transformer.get_word_embedding_dimension()) - - return SentenceTransformerWithNormalization(modules=[nomic_bert_transformer, pooling_model]) - nomic_bert = ModelMeta( loader=partial( # type: ignore nomic_bert_loader, model_name="nomic-ai/nomic-bert-2048", - revision=None, + revision="40b98394640e630d5276807046089b233113aa87", ), name="nomic-ai/nomic-bert-2048", languages=["eng-Latn"], @@ -152,4 +148,3 @@ def custom_nomic_bert_loader( public_training_code=True, max_tokens=2048, ) - diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 01694f91f7..affb9a69d7 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -11,9 +11,11 @@ from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import ( + amazon_models, arctic_models, bge_models, bm25, + cohere_bedrock_models, cohere_models, colbert_models, e5_instruct, @@ -30,6 +32,7 @@ model2vec_models, mxbai_models, no_instruct_sentence_models, + nomic_bert_model, nomic_models, nvidia_models, openai_models, @@ -43,9 +46,6 @@ stella_models, uae_models, voyage_models, - amazon_models, - cohere_bedrock_models, - nomic_bert_models ) logger = logging.getLogger(__name__) @@ -90,7 +90,7 @@ stella_models, amazon_models, cohere_bedrock_models, - nomic_bert_models, + nomic_bert_model, uae_models, voyage_models, ] diff --git a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py index 4e1d62ee36..f2a2f3baa7 100644 --- a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py +++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py @@ -14,10 +14,10 @@ } EVAL_LANGS = { - "iso-title": ["en-Latn", "eng-Latn"], - "iso-desc": ["en-Latn", "eng-Latn"], - "canon-title": ["en-Latn", "eng-Latn"], - "canon-desc": ["en-Latn", "eng-Latn"], + "iso-title": ["eng-Latn", "eng-Latn"], + "iso-desc": ["eng-Latn", "eng-Latn"], + "canon-title": ["eng-Latn", "eng-Latn"], + "canon-desc": ["eng-Latn", "eng-Latn"], } @@ -26,7 +26,7 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining, MultilingualTask): name="PubChemSMILESBitextMining", dataset={ "path": "BASF-AI/PubChemSMILESBitextMining", - "revision": "36700ea628118312ebf2f90ad2353a9a8f188dc9" + "revision": "36700ea628118312ebf2f90ad2353a9a8f188dc9", }, description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", reference="https://arxiv.org/abs/2412.00532", @@ -36,13 +36,13 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining, MultilingualTask): eval_splits=["test"], eval_langs=EVAL_LANGS, main_score="f1", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, @@ -71,11 +71,14 @@ def load_data(self, **kwargs): for subset in self.hf_subsets: self.dataset[subset] = datasets.load_dataset( - **self.metadata_dict["dataset"], name=subset) + **self.metadata_dict["dataset"], name=subset + ) self.dataset_transform() self.data_loaded = True def dataset_transform(self): for subset in self.hf_subsets: - self.dataset[subset] = self.dataset[subset].rename_columns(COL_MAPPING[subset]) + self.dataset[subset] = self.dataset[subset].rename_columns( + COL_MAPPING[subset] + ) diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 09e8761a17..12b0623b6b 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -39,21 +39,21 @@ from .eng.ToxicConversationsClassification import * from .eng.TweetSentimentExtractionClassification import * from .eng.TweetTopicSingleClassification import * +from .eng.WikipediaBiolumNeurochemClassification import * from .eng.WikipediaBioMetChemClassification import * -from .eng.WikipediaGreenhouseEnantiopureClassification import * -from .eng.WikipediaSolidStateColloidalClassification import * -from .eng.WikipediaOrganicInorganicClassification import * -from .eng.WikipediaCryobiologySeparationClassification import * -from .eng.WikipediaChemistryTopicsClassification import * -from .eng.WikipediaTheoreticalAppliedClassification import * +from .eng.WikipediaChemEngSpecialtiesClassification import * from .eng.WikipediaChemFieldsClassification import * -from .eng.WikipediaLuminescenceClassification import * +from .eng.WikipediaChemistryTopicsClassification import * +from .eng.WikipediaCompChemSpectroscopyClassification import * +from .eng.WikipediaCryobiologySeparationClassification import * +from .eng.WikipediaCrystallographyAnalyticalClassification import * +from .eng.WikipediaGreenhouseEnantiopureClassification import * from .eng.WikipediaIsotopesFissionClassification import * +from .eng.WikipediaLuminescenceClassification import * +from .eng.WikipediaOrganicInorganicClassification import * from .eng.WikipediaSaltsSemiconductorsClassification import * -from .eng.WikipediaBiolumNeurochemClassification import * -from .eng.WikipediaCrystallographyAnalyticalClassification import * -from .eng.WikipediaCompChemSpectroscopyClassification import * -from .eng.WikipediaChemEngSpecialtiesClassification import * +from .eng.WikipediaSolidStateColloidalClassification import * +from .eng.WikipediaTheoreticalAppliedClassification import * from .eng.YahooAnswersTopicsClassification import * from .eng.YelpReviewFullClassification import * from .est.estonian_valence import * diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py index cb442d22bf..197060ba0c 100644 --- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -19,13 +19,13 @@ class SDSEyeProtectionClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="LM-generated and reviewed", dialect=[], - sample_creation=None, + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py index f5055bd7ab..ac471d58e9 100644 --- a/mteb/tasks/Classification/eng/SDSGlovesClassification.py +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -19,13 +19,13 @@ class SDSGlovesClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="LM-generated and reviewed", dialect=[], - sample_creation=None, + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py index 30194b4950..3b494f46f6 100644 --- a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py @@ -19,13 +19,13 @@ class WikipediaBioMetChemClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py index ff3fd4f6ac..623ec8fc66 100644 --- a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py @@ -19,13 +19,13 @@ class WikipediaBiolumNeurochemClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", dialect=[], - sample_creation=None, + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py index 41c8c025d5..c95abcd4f2 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py @@ -19,13 +19,13 @@ class WikipediaChemEngSpecialtiesClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py index 3a6283556c..07509d427f 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py @@ -19,13 +19,13 @@ class WikipediaChemFieldsClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py index 9a7c52bd7d..02751b1a32 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py @@ -19,13 +19,13 @@ class WikipediaChemistryTopicsClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py index 837800cc1f..28a42ac044 100644 --- a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py @@ -19,13 +19,13 @@ class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py index 3e44b91e77..0e01454298 100644 --- a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py @@ -19,13 +19,13 @@ class WikipediaCryobiologySeparationClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py index e42d787949..724ffc4249 100644 --- a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py @@ -19,13 +19,13 @@ class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py index a24567473f..b701584a70 100644 --- a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py @@ -19,13 +19,13 @@ class WikipediaGreenhouseEnantiopureClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py index d1d713cfb8..252ad85ed9 100644 --- a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py @@ -19,13 +19,13 @@ class WikipediaIsotopesFissionClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py index e96f172e81..8e115b59d4 100644 --- a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py @@ -19,13 +19,13 @@ class WikipediaLuminescenceClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py index add6687bd8..0ad784b69b 100644 --- a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py @@ -19,13 +19,13 @@ class WikipediaOrganicInorganicClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py index b71c9dcd2c..a409f87c8d 100644 --- a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py @@ -10,7 +10,7 @@ class WikipediaSaltsSemiconductorsClassification(AbsTaskClassification): description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaHard2SaltsVsSemiconductorMaterialsClassification", + "path": "BASF-AI/WikipediaHard2SaltsVsSemiconductorMaterialsClassification", "revision": "9e5415a096012fa2d1f3a929952cf9859e4550e7", }, type="Classification", @@ -19,13 +19,13 @@ class WikipediaSaltsSemiconductorsClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py index 6ce489b681..43f95c50f3 100644 --- a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py @@ -19,13 +19,13 @@ class WikipediaSolidStateColloidalClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py index 8724042a7d..835ce579bc 100644 --- a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py @@ -19,13 +19,13 @@ class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 601029c4d6..de27839290 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -18,8 +18,8 @@ from .eng.StackExchangeClusteringP2P import * from .eng.TwentyNewsgroupsClustering import * from .eng.WikiCitiesClustering import * -from .eng.WikipediaChemistryTopicsClustering import * from .eng.WikipediaChemistrySpecialtiesClustering import * +from .eng.WikipediaChemistryTopicsClustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py index 3385e3b8b5..a4e4082a69 100644 --- a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py @@ -1,9 +1,8 @@ from __future__ import annotations +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskClustering import AbsTaskClustering - class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): metadata = TaskMetadata( @@ -20,13 +19,13 @@ class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], task_subtypes=[], license="cc-by-nc-sa-4.0", - annotations_creators=None, + annotations_creators="derived", dialect=[], - sample_creation=None, + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py index ec211378bd..bfa5e1fcf3 100644 --- a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py @@ -1,9 +1,8 @@ from __future__ import annotations +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskClustering import AbsTaskClustering - class WikipediaChemistryTopicsClustering(AbsTaskClustering): metadata = TaskMetadata( @@ -11,7 +10,7 @@ class WikipediaChemistryTopicsClustering(AbsTaskClustering): description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", reference="https://arxiv.org/abs/2412.00532", dataset={ - "path": "BASF-We-Create-Chemistry/WikipediaEasy10Clustering", + "path": "BASF-AI/WikipediaEasy10Clustering", "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", }, type="Clustering", @@ -20,13 +19,13 @@ class WikipediaChemistryTopicsClustering(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], task_subtypes=[], license="cc-by-nc-sa-4.0", - annotations_creators=None, + annotations_creators="derived", dialect=[], - sample_creation=None, + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py index fe1a6b56bc..54851249c8 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -13,7 +13,7 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/PubChemAISentenceParaphrasePC", - "revision": "f33a205966ce032f957c3a22f4f9e378f89a2c56" + "revision": "f33a205966ce032f957c3a22f4f9e378f89a2c56", }, type="PairClassification", category="s2s", @@ -21,13 +21,13 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="max_ap", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="LM-generated", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, @@ -69,7 +69,7 @@ def dataset_transform(self): { "sentence1": hf_dataset["sent1"], "sentence2": hf_dataset["sent2"], - "labels": hf_dataset["labels"] + "labels": hf_dataset["labels"], } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py index 78b1ae665b..6b02b156c0 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -32,6 +32,7 @@ }, ] + class PubChemSMILESPC(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemSMILESPC", @@ -39,7 +40,7 @@ class PubChemSMILESPC(AbsTaskPairClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/PubChemSMILESPairClassification", - "revision": "7ba40b69f5fe6ffe4cc189aac9e1710913c73c8a" + "revision": "7ba40b69f5fe6ffe4cc189aac9e1710913c73c8a", }, type="PairClassification", category="s2s", @@ -47,13 +48,13 @@ class PubChemSMILESPC(AbsTaskPairClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="max_ap", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, @@ -106,10 +107,12 @@ def load_data(self): self.dataset_transform() self.data_loaded = True - def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=self.metadata_dict["eval_splits"], label="labels" + self.dataset, + seed=self.seed, + splits=self.metadata_dict["eval_splits"], + label="labels", ) _dataset = {} diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py index 18dba0eabc..965aca2666 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -13,7 +13,7 @@ class PubChemSynonymPC(AbsTaskPairClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/PubChemSynonymPC", - "revision": "5037d69d177c9628fb79cb57eea1299178b28c1b" + "revision": "5037d69d177c9628fb79cb57eea1299178b28c1b", }, type="PairClassification", category="s2s", @@ -21,13 +21,13 @@ class PubChemSynonymPC(AbsTaskPairClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="max_ap", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py index 99c0097280..e2836f6bfe 100644 --- a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -13,7 +13,7 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/PubChemWikiParagraphsPC", - "revision": "7fb14716e4106b72f51a16e682e5cd2d67e9bd70" + "revision": "7fb14716e4106b72f51a16e682e5cd2d67e9bd70", }, type="PairClassification", category="p2p", @@ -21,13 +21,13 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="max_ap", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, @@ -46,7 +46,6 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): publisher={Oxford University Press} } """, - ) def load_data(self): @@ -70,7 +69,7 @@ def dataset_transform(self): { "sentence1": hf_dataset["sent1"], "sentence2": hf_dataset["sent2"], - "labels": hf_dataset["labels"] + "labels": hf_dataset["labels"], } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py index 5701db915c..87806cd76c 100644 --- a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py @@ -13,7 +13,7 @@ class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/WikipediaAIParagraphsParaphrasePC", - "revision": "7694661b6e28000d9b2c2376a1bbd49417d279ea" + "revision": "7694661b6e28000d9b2c2376a1bbd49417d279ea", }, type="PairClassification", category="p2p", @@ -21,13 +21,13 @@ class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="max_ap", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="LM-generated", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, @@ -59,7 +59,7 @@ def dataset_transform(self): { "sentence1": hf_dataset["sent1"], "sentence2": hf_dataset["sent2"], - "labels": hf_dataset["labels"] + "labels": hf_dataset["labels"], } ] self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py index 29b4895f83..59a0605a82 100644 --- a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -5,18 +5,18 @@ from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { - 'de': ["deu-Latn", "eng-Latn"], - 'nl': ["nld-Latn", "eng-Latn"], - 'zh': ["zho-Hans", "eng-Latn"], - 'fr': ["fra-Latn", "eng-Latn"], - 'es': ["spa-Latn", "eng-Latn"], - 'pt': ["por-Latn", "eng-Latn"], - 'ms': ["msa-Latn", "eng-Latn"], - 'ko': ["kor-Hang", "eng-Latn"], - 'tr': ["tur-Latn", "eng-Latn"], - 'hi': ["hin-Deva", "eng-Latn"], - 'cs': ["ces-Latn", "eng-Latn"], - 'ja': ["jpn-Jpan", "eng-Latn"], + "de": ["deu-Latn", "eng-Latn"], + "nl": ["nld-Latn", "eng-Latn"], + "zh": ["zho-Hans", "eng-Latn"], + "fr": ["fra-Latn", "eng-Latn"], + "es": ["spa-Latn", "eng-Latn"], + "pt": ["por-Latn", "eng-Latn"], + "ms": ["msa-Latn", "eng-Latn"], + "ko": ["kor-Hang", "eng-Latn"], + "tr": ["tur-Latn", "eng-Latn"], + "hi": ["hin-Deva", "eng-Latn"], + "cs": ["ces-Latn", "eng-Latn"], + "ja": ["jpn-Jpan", "eng-Latn"], } @@ -35,13 +35,13 @@ class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask) eval_splits=["test"], eval_langs=_LANGUAGES, main_score="max_ap", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="created", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index f1e549eace..6c146379ea 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -29,8 +29,8 @@ from .eng.ARCChallengeRetrieval import * from .eng.ArguAnaRetrieval import * from .eng.BrightRetrieval import * -from .eng.ChemNQRetrieval import * from .eng.ChemHotpotQARetrieval import * +from .eng.ChemNQRetrieval import * from .eng.ClimateFEVERRetrieval import * from .eng.CQADupstackAndroidRetrieval import * from .eng.CQADupstackEnglishRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py index ce9d222058..88fbc50df4 100644 --- a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class ChemHotpotQARetrieval(AbsTaskRetrieval): @@ -19,13 +19,13 @@ class ChemHotpotQARetrieval(AbsTaskRetrieval): eval_splits=["train", "dev", "test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="found", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, diff --git a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py index a0bfb9c1fa..1e77971331 100644 --- a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class ChemNQRetrieval(AbsTaskRetrieval): @@ -19,13 +19,13 @@ class ChemNQRetrieval(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, + date=("2024-06-01", "2024-11-30"), domains=["Chemistry"], - task_subtypes=None, + task_subtypes=[], license="cc-by-nc-sa-4.0", annotations_creators="derived", - dialect=None, - sample_creation=None, + dialect=[], + sample_creation="found", bibtex_citation=""" @article{kasmaee2024chemteb, title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, From 256d6d827b3bd49e01358aeda726533051d69969 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 03:19:45 +0000 Subject: [PATCH 41/49] Remove `nomic_bert_model.py` as it is now compatible with SentenceTransformer. --- mteb/models/nomic_bert_model.py | 150 -------------------------------- mteb/models/overview.py | 2 - 2 files changed, 152 deletions(-) delete mode 100644 mteb/models/nomic_bert_model.py diff --git a/mteb/models/nomic_bert_model.py b/mteb/models/nomic_bert_model.py deleted file mode 100644 index 2210bb920c..0000000000 --- a/mteb/models/nomic_bert_model.py +++ /dev/null @@ -1,150 +0,0 @@ -from __future__ import annotations - -from functools import partial -from typing import Any - -import torch -import torch.nn as nn -from sentence_transformers import SentenceTransformer -from sentence_transformers.models import Pooling, Transformer -from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer - -from mteb.model_meta import ModelMeta - - -class NomicBertTransformer(Transformer): - def __init__( - self, - model_name_or_path: str, - max_seq_length: int | None = None, - model_args: dict[str, Any] | None = None, - tokenizer_args: dict[str, Any] | None = None, - config_args: dict[str, Any] | None = None, - cache_dir: str | None = None, - do_lower_case: bool = False, - tokenizer_name_or_path: str = None, - revision: str = None, - ) -> None: - nn.Module.__init__(self) - self.config_keys = ["max_seq_length", "do_lower_case"] - self.do_lower_case = do_lower_case - if model_args is None: - model_args = {} - if tokenizer_args is None: - tokenizer_args = {} - if config_args is None: - config_args = {} - - config = AutoConfig.from_pretrained( - model_name_or_path, **config_args, cache_dir=cache_dir - ) - self.auto_model = AutoModelForMaskedLM.from_pretrained( - model_name_or_path, - config=config, - revision=revision, - cache_dir=cache_dir, - **model_args, - ) - self.auto_model.cls = nn.Identity() - if max_seq_length is not None and "model_max_length" not in tokenizer_args: - tokenizer_args["model_max_length"] = max_seq_length - - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path - if tokenizer_name_or_path is not None - else model_name_or_path, - cache_dir=cache_dir, - **tokenizer_args, - ) - # No max_seq_length set. Try to infer from model - if max_seq_length is None: - if ( - hasattr(self.auto_model, "config") - and hasattr(self.auto_model.config, "max_position_embeddings") - and hasattr(self.tokenizer, "model_max_length") - ): - max_seq_length = min( - self.auto_model.config.max_position_embeddings, - self.tokenizer.model_max_length, - ) - - self.max_seq_length = max_seq_length - if tokenizer_name_or_path is not None: - self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - - def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Returns token_embeddings, cls_token""" - trans_features = { - "input_ids": features["input_ids"], - "attention_mask": features["attention_mask"], - } - if "token_type_ids" in features: - trans_features["token_type_ids"] = features["token_type_ids"] - - output_states = self.auto_model(**trans_features) - output_tokens = output_states.logits - - features.update( - { - "token_embeddings": output_tokens, - "attention_mask": features["attention_mask"], - } - ) - - if self.auto_model.config.output_hidden_states: - all_layer_idx = 2 - if ( - len(output_states) < 3 - ): # Some models only output last_hidden_states and all_hidden_states - all_layer_idx = 1 - - hidden_states = output_states[all_layer_idx] - features.update({"all_layer_embeddings": hidden_states}) - - return features - - -class SentenceTransformerWithNormalization(SentenceTransformer): - def encode(self, sentences, *args, **kwargs): - if "normalize_embeddings" not in kwargs: - kwargs["normalize_embeddings"] = True - - return super().encode(sentences, *args, **kwargs) - - -def nomic_bert_loader( - model_name: str, revision: str | None, **kwargs -) -> SentenceTransformer: - nomic_bert_transformer = NomicBertTransformer( - model_name_or_path=model_name, - tokenizer_name_or_path="bert-base-uncased", - config_args={"trust_remote_code": True}, - model_args={"trust_remote_code": True}, - revision=revision, - ) - - pooling_model = Pooling(nomic_bert_transformer.get_word_embedding_dimension()) - - return SentenceTransformerWithNormalization( - modules=[nomic_bert_transformer, pooling_model] - ) - - -nomic_bert = ModelMeta( - loader=partial( # type: ignore - nomic_bert_loader, - model_name="nomic-ai/nomic-bert-2048", - revision="40b98394640e630d5276807046089b233113aa87", - ), - name="nomic-ai/nomic-bert-2048", - languages=["eng-Latn"], - open_weights=True, - revision="40b98394640e630d5276807046089b233113aa87", - release_date="2024-01-03", # first commit - license="apache-2.0", - framework=["Sentence Transformers", "PyTorch"], - reference="https://huggingface.co/nomic-ai/nomic-bert-2048", - public_training_data=True, - public_training_code=True, - max_tokens=2048, -) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index affb9a69d7..c908b24b27 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -32,7 +32,6 @@ model2vec_models, mxbai_models, no_instruct_sentence_models, - nomic_bert_model, nomic_models, nvidia_models, openai_models, @@ -90,7 +89,6 @@ stella_models, amazon_models, cohere_bedrock_models, - nomic_bert_model, uae_models, voyage_models, ] From 347501785a9fda0aed14fcd2cf966d18bbbb4756 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 03:32:21 +0000 Subject: [PATCH 42/49] Remove `WikipediaAIParagraphsParaphrasePC` task due to being trivial. --- docs/benchmarks.md | 42 ++++++------ docs/tasks.md | 8 +-- mteb/benchmarks/benchmarks.py | 1 - mteb/tasks/PairClassification/__init__.py | 1 - .../eng/WikipediaAIParagraphsParaphrasePC.py | 65 ------------------- 5 files changed, 24 insertions(+), 93 deletions(-) delete mode 100644 mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 15051f0fe3..7c0f07d878 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -8,26 +8,26 @@ The following table gives you an overview of the benchmarks in MTEB. | Name | # Tasks | Task Types | Domains | Languages | |------|---------|------------|---------|-----------| | [BRIGHT](https://brightbenchmark.github.io/) | 1 | {'Retrieval': 1} | [Non-fiction] | eng | -| [ChemTEB](https://arxiv.org/abs/2412.00532) | 28 | {'BitextMining': 1, 'Classification': 17, 'Clustering': 2, 'PairClassification': 6, 'Retrieval': 2} | [Chemistry] | msa,zho,jpn,fra,kor,en,nld,por,hin,tur,deu,eng,spa,ces | -| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | c++,javascript,java,ruby,go,python,sql,eng,php | -| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | 6 | {'Retrieval': 6} | [Written, Academic, Non-fiction, Encyclopaedic, Fiction, Spoken, Blog] | eng | -| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Reviews, Written, Social] | tgl,arz,tha,cym,srp,ita,swh,sun,hrv,ace,bul,jpn,mak,tam,xho,yue,ind,dtp,mui,ban,war,epo,hau,fra,ber,tzl,urd,hun,swe,heb,aze,nds,lit,por,tuk,bel,kor,spa,est,bos,gla,hye,kur,cbk,ces,kaz,eus,uzb,max,kat,arq,nij,tur,eng,bre,bug,nld,mad,glg,ido,bbc,nob,sqi,pol,amh,ang,awa,dan,cat,bjn,slv,tel,ast,ibo,bew,lat,zsm,ceb,ara,isl,yor,fry,slk,wuu,bhp,hin,fin,oci,vie,mal,pam,pms,mar,nov,mkd,lfn,pcm,ukr,mhr,afr,dsb,min,cmn,fao,ben,lvs,deu,orv,gle,kab,cor,jav,ile,ell,nno,uig,cha,abs,hsb,khm,csb,pes,ina,kzj,rej,gsw,tat,mon,ron,swg,yid,rus | -| MTEB(Europe, beta) | 74 | {'BitextMining': 7, 'Classification': 21, 'Clustering': 8, 'Retrieval': 15, 'InstructionRetrieval': 3, 'MultilabelClassification': 2, 'PairClassification': 6, 'Reranking': 3, 'STS': 9} | [News, Spoken, Written, Academic, Religious, Non-fiction, Encyclopaedic, Reviews, Constructed, Government, Social, Medical, Programming, Blog, Legal, Subtitles, Web, Fiction] | cwe,gub,poe,nbq,mlt,nhi,mek,szl,cak,rom,sot,stp,taw,kiz,sna,kgk,okv,kqw,aoi,gnw,not,cle,txq,vec,ian,kyg,cbi,pab,car,luo,mzz,srm,jae,caa,fuf,xla,nuy,zho,cot,bak,tbc,mai,quc,kmo,cap,kwd,mya,heb,nab,aze,ikw,ajp,nop,nys,rop,tee,dji,bqp,jid,hne,spa,nhe,pio,qve,kkl,csy,met,mpm,cbc,azb,cnl,chk,abx,aia,geb,atg,nak,cso,amr,nnq,yml,cjk,wmt,kbq,mmo,glk,bsp,kaz,ton,zlm,cop,naf,upv,sim,dwr,tlf,cav,aai,tur,wrk,bsn,jvn,gng,mpt,hns,cbt,kde,dov,mle,mwf,bug,zga,kam,bba,nld,rai,ltz,glg,opm,sat,awb,boa,gmv,wsk,hch,mcf,pol,tiw,uzn,tsw,hbo,ngu,maq,mcq,bjz,jao,tel,ast,zpc,rkb,row,lua,mxt,heg,lid,srd,pap,ksd,maa,lbb,tvk,zar,als,ote,huu,waj,bkx,zao,qvw,bki,gdr,apz,ata,hmo,dgr,atd,kmk,acm,nhw,wln,myy,nii,yss,crh,hop,sps,nso,shj,ffm,mux,mgc,tof,maj,tzj,nor,bao,apn,qwh,azj,nou,tew,kkc,acq,gah,aom,kpg,qxn,aaz,sny,tbo,mar,nwi,orm,mos,quy,cco,kql,ebk,xav,zpz,min,cmn,zaa,imo,klv,kbc,zas,suz,amu,atb,knf,obo,san,agm,gyr,amm,tmd,zpl,gui,toc,caf,aui,kmu,nno,ter,azg,cha,tbz,aoj,pls,cpy,kwi,hix,ewe,mox,ubr,seh,anv,hlt,cab,nlg,rgu,bpr,kpj,tat,mbt,nko,jiv,etr,kdc,nho,rug,uvl,sue,ktm,urw,ssg,guj,mti,mcb,far,spm,bzh,tgl,cbs,sgz,aso,bsj,taj,bjk,mic,noa,leu,bon,chf,qvc,rmc,jpn,cpc,blz,kjs,kmr,cux,pwg,xho,yue,kwf,mee,qul,mpp,jic,tdt,agt,iou,kto,soq,ken,gwi,kon,hui,cax,mop,kyz,ncj,trc,viv,wiu,ziw,buk,cui,ots,pon,hun,swe,por,ood,run,fuh,kbp,mir,wuv,grc,ssx,ltg,ino,ntu,wal,amk,fon,llg,zap,crx,kpr,mil,est,fuc,kmg,kin,mdy,ixl,fas,agn,zam,bqc,gvc,xtm,sri,bos,sbk,nya,con,cme,gla,hye,kur,pad,bmh,kqa,xsi,knv,kat,gvs,ulk,wer,cbv,kas,mbh,lav,mau,bea,eng,nhy,otn,bre,yre,dad,djk,kqf,sus,bch,uli,kmb,qup,djr,mav,pbt,ndg,mlp,snc,kyf,nqo,mph,zpq,kup,nsn,amh,tac,aau,isn,xed,yon,zia,fuv,ncu,slv,kne,agd,cjv,swp,zaw,ibo,tna,glv,ikk,aey,cof,ame,wro,zsm,haw,sbe,guh,apr,ubu,ttc,sua,mie,sgb,nhr,for,ppo,msa,tav,pjt,zai,cni,mbj,quh,fue,top,wrs,inb,cmo,qxh,ztq,myw,apw,wim,vmy,dgz,cgc,wbp,afr,kze,lif,aly,aii,bss,ben,udu,bgs,kan,med,lvs,bco,mna,tir,too,hat,gle,ydd,sbs,nss,cbu,jni,yap,dif,kab,bjv,som,fil,uig,avt,wnc,zsr,ntp,tso,mkn,txu,tpi,toj,muy,yaq,tnn,enq,agg,spl,gum,srq,zpo,mig,ycn,box,iws,arn,chq,bhg,ron,dik,mcr,ary,bef,rus,tbf,bmr,cya,kgp,srp,rro,dhg,ory,tgk,ita,cym,swh,sun,aka,hrv,pri,sxb,ace,mlg,cpu,bem,kek,apu,bhl,mbc,tgp,tos,gvn,aby,faa,ded,apc,yad,kud,zpu,fij,war,qvn,nde,aak,mib,cjo,scn,hau,tte,fra,tnp,pah,sin,usp,ksr,mxb,tku,plu,lit,lim,pao,bel,arp,huv,nas,guo,kor,awk,bvr,xtd,cpb,acu,mbb,qvm,twi,dyu,tsn,kpx,beu,daa,arl,blw,msy,tcz,yle,swa,tum,tke,gaw,ces,aer,nif,zos,eus,npl,zat,kvn,are,lug,tca,maz,kaq,bnp,tzm,omw,wap,tif,gai,yal,bbb,gvf,spp,gnn,kew,piu,snx,tgo,mqb,lac,ndj,nhu,tpa,yrb,nob,sqi,amp,pus,boj,msc,dah,kje,awa,cat,kir,bjn,kyc,ilo,ctp,bdd,mxq,beo,mks,ven,dob,lat,pma,mri,cek,wbi,npi,nvm,wmw,ara,nch,shn,mqj,mey,shi,adz,nhg,knj,hin,msb,zac,mxp,fur,sag,kwj,oci,mco,chz,ura,nep,cpa,kqc,bbr,lww,lbk,nin,poy,eri,qvs,snd,bus,lex,lao,khk,zpm,tuc,bjp,mgh,ntj,mkd,bho,kpf,bps,rwo,sja,ssd,zca,zpv,klt,tod,dwy,gof,bod,kik,grn,mva,kea,ake,ons,acr,mbl,gul,mpx,qxo,crn,wos,yva,dzo,khs,mni,ayr,tpt,nfa,soy,zav,yuw,msk,tfr,khm,pes,mto,mio,sco,smk,agr,dop,tcs,apb,kpw,eko,emi,arb,pir,mgw,mwp,lij,emp,qvh,zab,cub,gaz,arz,otm,tha,pib,qvz,agu,ptu,gfk,hla,sll,tzo,snp,bul,gdn,mkl,tam,ind,cut,hvn,mbs,azz,lmo,ban,cth,epo,bmu,jac,mwc,usa,div,ghs,urd,umb,ncl,uvh,myk,tuk,tuf,tpz,xbi,dgc,zyp,mmx,roo,amn,bzd,pag,kms,tim,wol,hto,ese,hub,lus,poi,auy,snn,poh,bkq,fai,mps,rmy,yby,urt,tah,mag,byr,mih,kac,mcd,cbk,mhl,ptp,esk,mkj,kdl,mlh,uzb,reg,srn,gup,alq,ign,mcp,tuo,hus,cta,lin,meq,xnn,gym,mca,ksj,kgf,cao,mpj,cac,tnk,bvd,mvn,lgl,prf,nus,prs,nyu,hmn,mam,dww,yut,dan,khz,sey,cbr,bxh,bam,kvg,byx,bkd,pan,yaa,bmk,ceb,plt,yka,hot,att,urb,cnt,isl,ckb,yor,slk,wiv,zty,fin,ong,tbg,gam,anh,vie,mal,kiw,sab,tiy,gux,ipi,asm,acf,amx,bjr,mwe,myu,gun,kmh,xon,amo,knc,tet,uri,kbm,msm,meu,aeb,chd,yuj,wnu,ukr,alp,clu,mit,fao,ssw,ngp,otq,shp,deu,kue,kbh,mjc,cuc,bzj,big,kos,jav,ctu,aon,bgt,ell,abt,miz,amf,kyq,wat,qub,zaj,auc,quf,nna,zul,nca,cuk,smo,spy,vid,zad,ape,mon,wed,taq,ars,tue,lcm,tnc,ruf,awx | -| MTEB(Indic, beta) | 23 | {'BitextMining': 4, 'Clustering': 1, 'Classification': 13, 'STS': 1, 'PairClassification': 1, 'Retrieval': 2, 'Reranking': 1} | [News, Written, Religious, Non-fiction, Reviews, Constructed, Government, Encyclopaedic, Social, Web, Legal, Spoken, Fiction] | tgl,arz,tha,cym,mlt,ory,srp,szl,ita,tgk,sot,sun,aka,hrv,sna,ace,swh,bul,jpn,bem,kmr,tam,xho,yue,ind,vec,apc,lmo,ban,fij,war,kon,luo,epo,gom,zho,scn,hau,fra,bak,sin,mai,urd,umb,hun,mya,swe,heb,por,lim,lit,tuk,run,ajp,bel,kbp,pag,ltg,kor,wol,lus,dyu,twi,fon,hne,spa,tsn,mup,est,kin,azb,mag,fas,bos,kac,swa,gla,nya,hye,tum,cjk,ces,kaz,eus,lug,kat,kas,tur,tzm,eng,bug,kmb,kam,lin,nld,ltz,glg,sat,pbt,nob,prs,nus,gaz,nqo,pol,uzn,amh,pus,fuv,awa,dan,cat,kir,bjn,slv,ilo,brx,tel,bam,ast,lua,ibo,srd,pap,pan,mri,rus,als,zsm,ceb,npi,plt,raj,ara,shn,bgc,isl,ckb,yor,slk,acm,hin,sag,fur,fin,oci,crh,nep,nso,vie,mal,nor,asm,snd,azj,acq,knc,lao,khk,mar,aeb,mkd,gbm,mos,ukr,quy,bho,afr,fao,min,cmn,ssw,ben,kan,lvs,san,bod,deu,kik,tir,grn,gle,boy,hat,kea,ydd,mwr,kab,jav,nno,ell,dzo,som,mni,ayr,uig,tso,tpi,khm,ewe,pes,zul,smo,tat,doi,ron,taq,arb,ars,dik,lij,ary,guj | -| MTEB(Medical) | 12 | {'Retrieval': 9, 'Clustering': 2, 'Reranking': 1} | [Written, Academic, Non-fiction, Medical, Web, Government] | zho,ara,cmn,fra,kor,pol,vie,rus,eng,spa | -| MTEB(Multilingual, beta) | 132 | {'BitextMining': 13, 'Classification': 43, 'Clustering': 17, 'Retrieval': 18, 'InstructionRetrieval': 3, 'MultilabelClassification': 5, 'PairClassification': 11, 'Reranking': 6, 'STS': 16} | [News, Spoken, Written, Academic, Religious, Non-fiction, Encyclopaedic, Reviews, Government, Constructed, Social, Medical, Programming, Blog, Legal, Subtitles, Web, Fiction] | cwe,gub,poe,nbq,mlt,nhi,mek,szl,cak,rom,sot,stp,taw,kiz,sna,kgk,okv,kqw,aoi,gnw,not,cle,txq,vec,ian,kyg,cbi,mui,pab,car,luo,mzz,srm,jae,caa,fuf,xla,nuy,zho,cot,gom,bak,tbc,mai,quc,kmo,cap,kwd,mya,heb,nab,aze,ikw,ajp,nop,nys,rop,tee,dji,bqp,jid,hne,spa,nhe,pio,qve,kkl,csy,met,mpm,cbc,azb,cnl,chk,abx,aia,geb,atg,nak,cso,amr,nnq,yml,cjk,wmt,kbq,mmo,glk,bsp,kaz,ton,zlm,cop,naf,upv,sim,arq,dwr,tlf,cav,aai,tur,wrk,bsn,jvn,gng,mpt,hns,cbt,kde,dov,mle,mwf,bug,zga,kam,bba,nld,rai,ltz,glg,opm,sat,awb,boa,gmv,wsk,hch,mcf,pol,tiw,uzn,tsw,hbo,ngu,maq,brx,mcq,bjz,jao,tel,ast,zpc,rkb,row,lua,mxt,heg,bew,lid,srd,pap,ksd,maa,lbb,tvk,zar,als,ote,huu,waj,bkx,zao,qvw,bki,gdr,apz,ata,hmo,tyv,dgr,atd,kmk,acm,nhw,wln,myy,nii,yss,crh,hop,sps,nso,shj,ffm,mux,mgc,tof,maj,tzj,nor,bao,apn,qwh,azj,nou,tew,kkc,acq,gah,aom,kpg,qxn,aaz,sny,tbo,mar,nwi,orm,mos,quy,cco,kql,ebk,xav,mhr,zpz,min,cmn,zaa,imo,klv,kbc,zas,suz,amu,atb,knf,obo,san,agm,gyr,amm,tmd,zpl,gui,toc,caf,cor,aui,kmu,nno,ile,ter,azg,cha,tbz,aoj,hsb,pls,cpy,kwi,hix,ewe,mox,ubr,seh,anv,sah,hlt,cab,nlg,rgu,bpr,kpj,tat,mbt,nko,jiv,etr,kdc,nho,rug,uvl,sue,ktm,urw,ssg,yid,guj,mti,mcb,far,spm,bzh,tgl,cbs,sgz,aso,bsj,taj,bjk,mic,noa,leu,bon,chf,qvc,rmc,jpn,cpc,blz,kjs,kmr,cux,pwg,xho,yue,kwf,mee,qul,dtp,mpp,jic,tdt,agt,iou,kto,soq,ken,gwi,kon,hui,cax,mop,kyz,ncj,trc,viv,wiu,ziw,buk,cui,ots,pon,hun,swe,por,ood,run,fuh,kbp,mir,wuv,grc,ssx,ltg,chv,ino,ntu,wal,amk,fon,llg,zap,crx,kpr,mil,est,fuc,kmg,kin,mdy,ixl,fas,agn,zam,bqc,gvc,xtm,sri,bos,sbk,nya,con,cme,gla,hye,kur,pad,bmh,kqa,xsi,knv,kat,gvs,ulk,wer,cbv,kas,mbh,lav,mau,bea,eng,nhy,otn,bre,yre,dad,djk,kqf,sus,bch,uli,kmb,qup,djr,mav,pbt,ndg,mlp,snc,kyf,nqo,mph,zpq,kup,nsn,amh,tac,aau,isn,xed,yon,zia,fuv,ncu,slv,kne,agd,cjv,swp,zaw,ibo,tna,glv,ikk,aey,cof,ame,wro,zsm,haw,sbe,guh,apr,ubu,ttc,sua,mie,sgb,nhr,for,ppo,msa,tav,pjt,zai,cni,mbj,quh,fue,top,wrs,inb,cmo,qxh,ztq,myw,apw,wim,vmy,dgz,cgc,pcm,wbp,afr,kze,lif,aly,aii,bss,ben,udu,bgs,svk,kan,med,lvs,bco,mna,tir,too,hat,gle,ydd,sbs,nss,cbu,jni,yap,dif,kab,bjv,som,fil,uig,avt,wnc,zsr,ntp,tso,mkn,txu,tpi,toj,muy,yaq,tnn,enq,agg,spl,gum,srq,zpo,mig,kzj,ycn,box,iws,arn,chq,bhg,ron,dik,mcr,ary,bef,rus,tbf,bmr,cya,kgp,srp,rro,dhg,ory,tgk,ita,cym,swh,sun,aka,hrv,pri,sxb,ace,mlg,cpu,bem,kek,apu,bhl,mbc,tgp,tos,gvn,aby,faa,ded,apc,yad,kud,zpu,fij,war,qvn,nde,aak,mib,cjo,scn,hau,tte,fra,tnp,pah,sin,tzl,usp,ksr,mxb,tku,plu,lit,lim,pao,nds,bel,arp,huv,nas,guo,kor,awk,bvr,xtd,cpb,acu,mbb,qvm,twi,dyu,tsn,kpx,beu,daa,arl,blw,msy,tcz,yle,swa,tum,tke,gaw,ces,aer,nif,zos,eus,npl,zat,kvn,are,lug,tca,maz,kaq,bnp,tzm,omw,wap,tif,gai,yal,bbb,gvf,mad,spp,gnn,kew,piu,snx,tgo,mqb,ido,bbc,lac,ndj,nhu,tpa,yrb,nob,sqi,amp,pus,boj,msc,dah,kje,awa,cat,kir,bjn,kyc,ilo,ctp,bdd,mxq,beo,mks,ven,dob,lat,pma,mri,cek,wbi,npi,raj,nvm,wmw,ara,nch,shn,mqj,mey,shi,adz,nhg,knj,bhp,hin,msb,zac,mxp,fur,sag,kwj,oci,mco,chz,ura,nep,cpa,kqc,bbr,lww,lbk,nin,poy,eri,qvs,snd,bus,pms,lex,lao,khk,zpm,nov,tuc,bjp,mgh,ntj,mkd,lfn,bho,dsb,kpf,bps,rwo,sja,ssd,zca,zpv,klt,tod,dwy,gof,bod,kik,orv,grn,mva,kea,ake,mwr,ons,acr,mbl,gul,mpx,krc,qxo,crn,wos,yva,dzo,khs,mni,ayr,tpt,nfa,soy,zav,yuw,abs,msk,tfr,khm,pes,ina,mto,mio,sco,smk,agr,dop,tcs,apb,rej,kpw,gsw,doi,eko,emi,arb,pir,mgw,mwp,lij,emp,qvh,zab,cub,gaz,arz,otm,tha,pib,qvz,agu,ptu,gfk,hla,sll,tzo,snp,bul,gdn,mkl,tam,mak,ind,cut,hvn,mbs,azz,lmo,ban,cth,epo,bmu,jac,mwc,usa,div,ber,ghs,urd,umb,ncl,uvh,myk,tuk,tuf,tpz,xbi,dgc,zyp,mmx,roo,amn,bzd,pag,kms,tim,wol,hto,ese,hub,lus,poi,auy,snn,poh,bkq,fai,mps,rmy,yby,mup,urt,tah,mag,byr,mih,kac,mcd,cbk,mhl,ptp,esk,mkj,kdl,mlh,uzb,max,reg,srn,gup,alq,nij,ign,mcp,tuo,hus,cta,lin,meq,xnn,gym,mca,ksj,kgf,cao,mpj,cac,tnk,bvd,mvn,lgl,prf,nus,prs,nyu,hmn,mam,dww,ang,yut,dan,khz,sey,cbr,bxh,bam,kvg,byx,bkd,pan,yaa,bmk,ceb,plt,yka,hot,att,urb,cnt,bgc,isl,ckb,yor,fry,slk,wuu,wiv,zty,fin,ong,tbg,gam,anh,vie,mal,pam,kiw,sab,tiy,gux,ipi,asm,acf,amx,bjr,mwe,myu,gun,kmh,xon,amo,knc,tet,uri,kbm,msm,meu,aeb,chd,yuj,gbm,wnu,ukr,alp,clu,mit,fao,ssw,ngp,otq,shp,deu,kue,kbh,boy,mjc,cuc,bzj,big,kos,jav,ctu,aon,bgt,ell,abt,miz,amf,kyq,wat,qub,zaj,auc,quf,csb,nna,zul,nca,cuk,smo,spy,vid,zad,ape,mon,wed,taq,ars,swg,tue,lcm,tnc,ruf,awx | -| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [News, Written] | eng | -| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [News, Spoken, Written, Non-fiction, Encyclopaedic, Reviews, Government, Social, Blog, Legal, Web, Fiction] | nob,fao,isl,dan,swe,nno | -| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | c++,javascript,typescript,java,ruby,swift,go,scala,shell,python,sql,rust,c,eng,php | -| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [News, Written, Reviews, Encyclopaedic, Web, Spoken] | pol,deu,eng,fra | -| MTEB(eng, beta) | 41 | {'Classification': 8, 'Retrieval': 10, 'Clustering': 8, 'Reranking': 2, 'STS': 9, 'PairClassification': 3, 'Summarization': 1} | [Spoken, News, Written, Academic, Non-fiction, Reviews, Medical, Encyclopaedic, Social, Programming, Web, Blog] | ara,cmn,fra,pol,ita,nld,tur,deu,eng,spa | -| MTEB(eng, classic) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Spoken, News, Written, Academic, Non-fiction, Reviews, Medical, Encyclopaedic, Social, Programming, Web, Blog] | ara,cmn,fra,pol,ita,nld,tur,deu,eng,spa | -| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [News, Written, Academic, Non-fiction, Reviews, Encyclopaedic, Social, Web, Legal, Spoken] | pol,deu,eng,fra | -| [MTEB(jpn)](https://github.com/sbintuitions/JMTEB) | 16 | {'Clustering': 2, 'Classification': 4, 'STS': 2, 'PairClassification': 1, 'Retrieval': 6, 'Reranking': 1} | [News, Written, Academic, Non-fiction, Reviews, Encyclopaedic, Web, Spoken] | jpn | -| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [News, Written, Reviews, Encyclopaedic, Web, Spoken] | kor | -| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Legal, Written] | deu,eng,zho | -| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [News, Written, Academic, Non-fiction, Social, Web, Legal, Spoken, Fiction] | pol,deu,eng,fra | -| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [News, Spoken, Written, Academic, Reviews, Encyclopaedic, Social, Web, Blog] | rus | -| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | 13 | {'Retrieval': 13} | [News, Written, Academic, Non-fiction, Medical, Encyclopaedic, Social, Web] | eng | +| [ChemTEB](https://arxiv.org/abs/2412.00532) | 27 | {'BitextMining': 1, 'Classification': 17, 'Clustering': 2, 'PairClassification': 5, 'Retrieval': 2} | [Chemistry] | nld,tur,eng,ces,kor,zho,spa,hin,jpn,deu,fra,msa,por | +| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | javascript,ruby,sql,go,eng,java,php,python,c++ | +| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | 6 | {'Retrieval': 6} | [Fiction, Academic, Written, Blog, Non-fiction, Spoken, Encyclopaedic] | eng | +| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Reviews, Written, Social] | sqi,ban,srp,jpn,nds,lat,por,mon,kur,bul,slv,mak,deu,uzb,yor,kzj,max,kat,cha,yid,zsm,spa,pms,mhr,min,fao,heb,nij,mui,tuk,rus,bew,swe,pes,slk,ceb,bjn,ido,abs,ukr,ina,kab,tgl,cor,dan,kaz,fry,rej,hrv,ces,lfn,glg,dsb,hau,ace,urd,ben,yue,nld,eng,epo,ron,xho,wuu,cmn,ind,ang,hsb,mad,pam,nov,swh,bbc,pcm,ara,hye,mkd,nno,ast,jav,lvs,mal,swg,nob,tat,arz,vie,ile,tam,est,ber,bre,csb,pol,afr,cbk,bug,tzl,kor,ibo,hun,war,aze,tha,mar,uig,gla,orv,hin,amh,bel,sun,fin,cat,awa,gsw,isl,oci,ell,cym,arq,ita,fra,bos,dtp,eus,bhp,tel,tur,khm,lit,gle | +| MTEB(Europe, beta) | 74 | {'BitextMining': 7, 'Classification': 21, 'Clustering': 8, 'Retrieval': 15, 'InstructionRetrieval': 3, 'MultilabelClassification': 2, 'PairClassification': 6, 'Reranking': 3, 'STS': 9} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, News, Government, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,box,rus,chz,emp,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,hrv,nak,plu,nde,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,ngu,cpa,tif,fuv,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,cpc,quc,hvn,bvr,agu,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,pbt,mmo,amo,kgp,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,ign,vmy,uri,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,knv,heg,urw,ayr,ons,sat,crx,rop,szl,suz,ncl,anh,kto,tca,chk,xla,qxh,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,tbf,ubr,cth,taj,aby,kde,mqj,zao,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,kmb,cof,tso,ipi,byr,aia,wiv,agm,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,bzd,shj,ukr,mbl,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,fas,cbr,bjr,glv,mdy,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,tzo,arb,mos,kam,cuc,dgc,pah,pjt,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,eko,zia,mcp,gah,urt,sua,cab,quh,srm,vid,blz,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | +| MTEB(Indic, beta) | 23 | {'BitextMining': 4, 'Clustering': 1, 'Classification': 13, 'STS': 1, 'PairClassification': 1, 'Retrieval': 2, 'Reranking': 1} | [Web, Fiction, Social, Encyclopaedic, Religious, Written, Constructed, Non-fiction, Legal, News, Spoken, Reviews, Government] | ban,pag,ckb,ydd,srp,azj,jpn,bho,por,sna,als,scn,cjk,zho,mwr,bul,slv,deu,yor,bak,ory,aka,bam,kat,lua,kan,dzo,mya,zsm,spa,shn,min,nus,fao,heb,kac,lug,tuk,kea,rus,ssw,tum,swe,nso,pes,slk,som,mup,pbt,nya,ceb,bjn,kmr,apc,taq,ukr,kab,luo,tgl,dik,dan,kaz,kbp,hrv,ces,glg,ary,hau,ace,urd,ben,boy,ewe,ilo,yue,lin,nld,eng,hne,epo,kir,grn,ron,xho,smo,fur,knc,cmn,ind,ayr,sat,szl,pap,fas,kmb,tso,ltz,swh,brx,zul,azb,doi,ara,hye,mkd,nno,ast,jav,lvs,mal,lao,sot,wol,nob,ltg,tat,san,arz,lmo,vec,nor,vie,sag,khk,arb,mos,kam,tam,bgc,mai,gbm,srd,est,twi,crh,sin,nep,swa,umb,bod,pol,lim,nqo,afr,bug,kor,ibo,mri,hun,aeb,war,ars,mni,fon,tha,mar,tpi,tzm,acq,pan,uzn,kik,gla,uig,hin,lij,tgk,amh,bel,sun,acm,guj,fin,cat,awa,fij,npi,run,tsn,kas,tir,isl,asm,mlt,ell,oci,mag,cym,pus,gom,quy,ajp,raj,fuv,ita,kin,bos,fra,gaz,eus,tel,tur,snd,kon,khm,bem,dyu,gle,hat,lit,prs,lus,plt | +| MTEB(Medical) | 12 | {'Retrieval': 9, 'Clustering': 2, 'Reranking': 1} | [Web, Academic, Medical, Written, Non-fiction, Government] | rus,eng,kor,ara,spa,zho,vie,fra,pol,cmn | +| MTEB(Multilingual, beta) | 132 | {'BitextMining': 13, 'Classification': 43, 'Clustering': 17, 'Retrieval': 18, 'InstructionRetrieval': 3, 'MultilabelClassification': 5, 'PairClassification': 11, 'Reranking': 6, 'STS': 16} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, Government, News, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,mui,box,rus,chz,emp,bew,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,rej,hrv,nak,plu,nde,lfn,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,arq,ngu,cpa,tif,fuv,raj,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,yid,pms,mhr,cpc,quc,hvn,bvr,agu,svk,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,mup,pbt,mmo,amo,kgp,ido,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,chv,ign,vmy,cor,uri,fry,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,boy,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,wuu,knv,heg,urw,ayr,ons,sat,crx,ang,hsb,rop,szl,suz,mad,ncl,anh,kto,tca,chk,xla,qxh,brx,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,bgc,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,ber,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,csb,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,orv,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,max,tbf,ubr,cth,taj,aby,kde,mqj,zao,tyv,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,sah,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,mak,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,pam,kmb,cof,tso,ipi,byr,aia,wiv,pcm,agm,doi,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,tzl,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,gsw,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,nds,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,mwr,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,kzj,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,nij,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,abs,bzd,shj,ukr,mbl,ina,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,dsb,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,nov,fas,cbr,bjr,glv,mdy,bbc,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,swg,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,ile,tzo,arb,mos,kam,cuc,dgc,pah,pjt,gbm,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,krc,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,gom,eko,zia,mcp,gah,urt,sua,cab,quh,srm,dtp,vid,blz,bhp,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | +| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | +| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Web, Fiction, Social, Written, Blog, Non-fiction, Legal, News, Spoken, Reviews, Government, Encyclopaedic] | swe,nno,isl,dan,fao,nob | +| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | javascript,ruby,sql,go,c,eng,shell,typescript,rust,java,php,python,scala,swift,c++ | +| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | pol,deu,fra,eng | +| MTEB(eng, beta) | 41 | {'Classification': 8, 'Retrieval': 10, 'Clustering': 8, 'Reranking': 2, 'STS': 9, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | +| MTEB(eng, classic) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | +| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Non-fiction, Legal, News, Spoken, Reviews, Encyclopaedic] | pol,deu,fra,eng | +| [MTEB(jpn)](https://github.com/sbintuitions/JMTEB) | 16 | {'Clustering': 2, 'Classification': 4, 'STS': 2, 'PairClassification': 1, 'Retrieval': 6, 'Reranking': 1} | [Web, Academic, Written, Non-fiction, News, Spoken, Reviews, Encyclopaedic] | jpn | +| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | kor | +| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | deu,zho,eng | +| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Web, Fiction, Academic, Social, Written, Non-fiction, Legal, News, Spoken] | pol,deu,fra,eng | +| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Web, Social, Academic, Written, Blog, News, Spoken, Reviews, Encyclopaedic] | rus | +| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | 13 | {'Retrieval': 13} | [Web, Academic, Social, Medical, Written, Non-fiction, News, Encyclopaedic] | eng | | [RAR-b](https://arxiv.org/abs/2404.06347) | 17 | {'Retrieval': 17} | [Encyclopaedic, Written, Programming] | eng | \ No newline at end of file diff --git a/docs/tasks.md b/docs/tasks.md index 459c03e033..15b9474168 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -428,7 +428,7 @@ The following tables give you an overview of the tasks in MTEB. | [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | | [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, Non-fiction, Web, Written, Spoken, Social, News] | None | None | | [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | -| [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['en', 'eng'] | BitextMining | s2s | [Chemistry] | None | None | +| [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | BitextMining | s2s | [Chemistry] | None | None | | [PubChemSMILESPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemSynonymPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemWikiPairClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['ces', 'deu', 'eng', 'fra', 'hin', 'jpn', 'kor', 'msa', 'nld', 'por', 'spa', 'tur', 'zho'] | PairClassification | s2s | [Chemistry] | None | None | @@ -605,7 +605,6 @@ The following tables give you an overview of the tasks in MTEB. | [WebLINXCandidatesReranking](https://mcgill-nlp.github.io/weblinx) (Xing Han Lù, 2024) | ['eng'] | Reranking | p2p | [Academic, Web, Written] | None | None | | [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | -| [WikipediaAIParagraphsParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | | [WikipediaBioMetChemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaBiolumNeurochemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaChemEngSpecialtiesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | @@ -931,8 +930,7 @@ The following tables give you an overview of the tasks in MTEB. | ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | | emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| en | unknown | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 14 | 8 | 107 | 13 | 2 | 1 | 344 | +| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 107 | 13 | 2 | 1 | 343 | | enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1704,7 +1702,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1396 | 812 | 306 | 3 | 28 | 85 | 51 | 475 | 85 | 2 | 2 | +| Total | None | None | None | 1395 | 812 | 306 | 3 | 28 | 84 | 51 | 475 | 85 | 2 | 2 | diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 5ec3e09635..84490333be 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1066,7 +1066,6 @@ def load_results( "PubChemSMILESPC", "PubChemSynonymPC", "PubChemWikiParagraphsPC", - "WikipediaAIParagraphsParaphrasePC", "PubChemWikiPairClassification", "ChemNQRetrieval", "ChemHotpotQARetrieval", diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index d821346ab5..e229195df0 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -11,7 +11,6 @@ from .eng.SprintDuplicateQuestionsPC import * from .eng.TwitterSemEval2015PC import * from .eng.TwitterURLCorpusPC import * -from .eng.WikipediaAIParagraphsParaphrasePC import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py b/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py deleted file mode 100644 index 87806cd76c..0000000000 --- a/mteb/tasks/PairClassification/eng/WikipediaAIParagraphsParaphrasePC.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class WikipediaAIParagraphsParaphrasePC(AbsTaskPairClassification): - metadata = TaskMetadata( - name="WikipediaAIParagraphsParaphrasePC", - description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", - reference="https://arxiv.org/abs/2412.00532", - dataset={ - "path": "BASF-AI/WikipediaAIParagraphsParaphrasePC", - "revision": "7694661b6e28000d9b2c2376a1bbd49417d279ea", - }, - type="PairClassification", - category="p2p", - modalities=["text"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="max_ap", - date=("2024-06-01", "2024-11-30"), - domains=["Chemistry"], - task_subtypes=[], - license="cc-by-nc-sa-4.0", - annotations_creators="LM-generated", - dialect=[], - sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, - ) - - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - _dataset = {} - for split in self.metadata.eval_splits: - hf_dataset = self.dataset[split] - _dataset[split] = [ - { - "sentence1": hf_dataset["sent1"], - "sentence2": hf_dataset["sent2"], - "labels": hf_dataset["labels"], - } - ] - self.dataset = _dataset From da4ef353539ccea92f82510205ce63e503f49795 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 20:20:36 +0000 Subject: [PATCH 43/49] Merge `amazon_models` and `cohere_bedrock_models.py` into `bedrock_models.py` --- mteb/models/amazon_models.py | 86 --------- mteb/models/bedrock_models.py | 217 ++++++++++++++++++++++ mteb/models/cohere_bedrock_models.py | 264 --------------------------- mteb/models/overview.py | 6 +- 4 files changed, 219 insertions(+), 354 deletions(-) delete mode 100644 mteb/models/amazon_models.py create mode 100644 mteb/models/bedrock_models.py delete mode 100644 mteb/models/cohere_bedrock_models.py diff --git a/mteb/models/amazon_models.py b/mteb/models/amazon_models.py deleted file mode 100644 index 845ad7bc0c..0000000000 --- a/mteb/models/amazon_models.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -import json -import logging -from functools import partial -from typing import Any - -import numpy as np - -from mteb.model_meta import ModelMeta -from mteb.requires_package import requires_package - -from .wrapper import Wrapper - -logger = logging.getLogger(__name__) - - -class AmazonWrapper(Wrapper): - def __init__(self, model_id: str, **kwargs) -> None: - requires_package(self, "boto3", "Amazon Bedrock") - import boto3 - - boto3_session = boto3.session.Session() - region_name = boto3_session.region_name - self._client = boto3.client( - "bedrock-runtime", - region_name, - ) - self._model_id = model_id - - def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: - requires_package(self, "boto3", "Amazon Bedrock") - - all_embeddings = [] - - for sentence in sentences: - response = self._client.invoke_model( - body=json.dumps({"inputText": sentence}), - modelId=self._model_id, - accept="application/json", - contentType="application/json", - ) - all_embeddings.append(self._to_numpy(response)) - - return np.array(all_embeddings) - - def _to_numpy(self, embedding_response) -> np.ndarray: - response = json.loads(embedding_response.get("body").read()) - return np.array(response["embedding"]) - - -amazon_titan_embed_text_v1 = ModelMeta( - name="amazon/titan-embed-text-v1", - revision="1", - release_date="2023-09-27", - languages=None, # supported languages not specified - loader=partial(AmazonWrapper, model_id="amazon.titan-embed-text-v1"), - max_tokens=8192, - embed_dim=1536, - open_weights=False, - n_parameters=None, - memory_usage=None, - license=None, - reference="https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/", - similarity_fn_name="cosine", - framework=["API"], - use_instructions=False, -) - -amazon_titan_embed_text_v2 = ModelMeta( - name="amazon/titan-embed-text-v2", - revision="1", - release_date="2024-04-30", - languages=None, # supported languages not specified - loader=partial(AmazonWrapper, model_id="amazon.titan-embed-text-v2:0"), - max_tokens=8192, - embed_dim=1024, - open_weights=False, - n_parameters=None, - memory_usage=None, - license=None, - reference="https://aws.amazon.com/about-aws/whats-new/2024/04/amazon-titan-text-embeddings-v2-amazon-bedrock/", - similarity_fn_name="cosine", - framework=["API"], - use_instructions=False, -) diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py new file mode 100644 index 0000000000..c3bfbfcdf0 --- /dev/null +++ b/mteb/models/bedrock_models.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import json +import logging +from functools import partial +from typing import Any + +import numpy as np +import tqdm + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.cohere_models import model_prompts as cohere_model_prompts +from mteb.models.cohere_models import supported_languages as cohere_supported_languages +from mteb.requires_package import requires_package + +from .wrapper import Wrapper + +logger = logging.getLogger(__name__) + + +class BedrockWrapper(Wrapper): + def __init__( + self, + model_id: str, + provider: str, + model_prompts: dict[str, str] | None = None, + **kwargs, + ) -> None: + requires_package(self, "boto3", "The AWS SDK for Python") + import boto3 + + boto3_session = boto3.session.Session() + region_name = boto3_session.region_name + self._client = boto3.client("bedrock-runtime", region_name) + + self._model_id = model_id + self._provider = provider.lower() + + if self._provider == "cohere": + self.model_prompts = ( + self.validate_task_to_prompt_name(model_prompts) + if model_prompts + else None + ) + self._max_batch_size = 96 + self._max_sequence_length = 2048 + + def encode( + self, + sentences: list[str], + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + requires_package(self, "boto3", "Amazon Bedrock") + show_progress_bar = ( + False + if "show_progress_bar" not in kwargs + else kwargs.pop("show_progress_bar") + ) + if self._provider == "amazon": + return self._encode_amazon(sentences, show_progress_bar) + elif self._provider == "cohere": + prompt_name = self.get_prompt_name( + self.model_prompts, task_name, prompt_type + ) + cohere_task_type = self.model_prompts.get(prompt_name, "search_document") + return self._encode_cohere(sentences, cohere_task_type, show_progress_bar) + else: + raise ValueError( + f"Unknown provider '{self._provider}'. Must be 'amazon' or 'cohere'." + ) + + def _encode_amazon( + self, sentences: list[str], show_progress_bar: bool = False + ) -> np.ndarray: + all_embeddings = [] + + for sentence in tqdm.tqdm( + sentences, leave=False, disable=not show_progress_bar + ): + response = self._client.invoke_model( + body=json.dumps({"inputText": sentence}), + modelId=self._model_id, + accept="application/json", + contentType="application/json", + ) + all_embeddings.append(self._to_numpy(response)) + + return np.array(all_embeddings) + + def _encode_cohere( + self, + sentences: list[str], + cohere_task_type: str, + show_progress_bar: bool = False, + ) -> np.ndarray: + batches = [ + sentences[i : i + self._max_batch_size] + for i in range(0, len(sentences), self._max_batch_size) + ] + + all_embeddings = [] + + for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): + response = self._client.invoke_model( + body=json.dumps( + { + "texts": [sent[: self._max_sequence_length] for sent in batch], + "input_type": cohere_task_type, + } + ), + modelId=self._model_id, + accept="*/*", + contentType="application/json", + ) + all_embeddings.extend(self._to_numpy(response)) + + return np.array(all_embeddings) + + def _to_numpy(self, embedding_response) -> np.ndarray: + response = json.loads(embedding_response.get("body").read()) + key = "embedding" if self._provider == "amazon" else "embeddings" + return np.array(response[key]) + + +amazon_titan_embed_text_v1 = ModelMeta( + name="bedrock/amazon-titan-embed-text-v1", + revision="1", + release_date="2023-09-27", + languages=None, # not specified + loader=partial( + BedrockWrapper, + model_id="amazon.titan-embed-text-v1", + provider="amazon", + ), + max_tokens=8192, + embed_dim=1536, + open_weights=False, + n_parameters=None, + memory_usage=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, +) + +amazon_titan_embed_text_v2 = ModelMeta( + name="bedrock/amazon-titan-embed-text-v2", + revision="1", + release_date="2024-04-30", + languages=None, # not specified + loader=partial( + BedrockWrapper, + model_id="amazon.titan-embed-text-v2:0", + provider="amazon", + ), + max_tokens=8192, + embed_dim=1024, + open_weights=False, + n_parameters=None, + memory_usage=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2024/04/amazon-titan-text-embeddings-v2-amazon-bedrock/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, +) + +cohere_embed_english_v3 = ModelMeta( + loader=partial( + BedrockWrapper, + model_id="cohere.embed-english-v3", + provider="cohere", + model_prompts=cohere_model_prompts, + ), + name="bedrock/cohere-embed-english-v3", + languages=["eng-Latn"], + open_weights=False, + reference="https://cohere.com/blog/introducing-embed-v3", + revision="1", + release_date="2023-11-02", + n_parameters=None, + memory_usage=None, + max_tokens=512, + embed_dim=1024, + license=None, + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, +) + +cohere_embed_multilingual_v3 = ModelMeta( + loader=partial( + BedrockWrapper, + model_id="cohere.embed-multilingual-v3", + provider="cohere", + model_prompts=cohere_model_prompts, + ), + name="bedrock/cohere-embed-multilingual-v3", + languages=cohere_supported_languages, + open_weights=False, + reference="https://cohere.com/blog/introducing-embed-v3", + revision="1", + release_date="2023-11-02", + n_parameters=None, + memory_usage=None, + max_tokens=512, + embed_dim=1024, + license=None, + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, +) diff --git a/mteb/models/cohere_bedrock_models.py b/mteb/models/cohere_bedrock_models.py deleted file mode 100644 index 6f37660e95..0000000000 --- a/mteb/models/cohere_bedrock_models.py +++ /dev/null @@ -1,264 +0,0 @@ -from __future__ import annotations - -import json -import logging -from functools import partial -from typing import Any - -import numpy as np -import tqdm - -from mteb.encoder_interface import PromptType -from mteb.model_meta import ModelMeta -from mteb.requires_package import requires_package - -from .wrapper import Wrapper - -logger = logging.getLogger(__name__) - - -supported_languages = [ - "afr-Latn", - "amh-Ethi", - "ara-Arab", - "asm-Beng", - "aze-Latn", - "bel-Cyrl", - "bul-Cyrl", - "ben-Beng", - "bod-Tibt", - "bos-Latn", - "cat-Latn", - "ceb-Latn", - "cos-Latn", - "ces-Latn", - "cym-Latn", - "dan-Latn", - "deu-Latn", - "ell-Grek", - "eng-Latn", - "epo-Latn", - "spa-Latn", - "est-Latn", - "eus-Latn", - "fas-Arab", - "fin-Latn", - "fra-Latn", - "fry-Latn", - "gle-Latn", - "gla-Latn", - "glg-Latn", - "guj-Gujr", - "hau-Latn", - "haw-Latn", - "heb-Hebr", - "hin-Deva", - "hmn-Latn", - "hrv-Latn", - "hat-Latn", - "hun-Latn", - "hye-Armn", - "ind-Latn", - "ibo-Latn", - "isl-Latn", - "ita-Latn", - "jpn-Jpan", - "jav-Latn", - "kat-Geor", - "kaz-Cyrl", - "khm-Khmr", - "kan-Knda", - "kor-Kore", - "kur-Arab", - "kir-Cyrl", - "lat-Latn", - "ltz-Latn", - "lao-Laoo", - "lit-Latn", - "lav-Latn", - "mlg-Latn", - "mri-Latn", - "mkd-Cyrl", - "mal-Mlym", - "mon-Cyrl", - "mar-Deva", - "msa-Latn", - "mlt-Latn", - "mya-Mymr", - "nep-Deva", - "nld-Latn", - "nor-Latn", - "nya-Latn", - "ori-Orya", - "pan-Guru", - "pol-Latn", - "por-Latn", - "ron-Latn", - "rus-Cyrl", - "kin-Latn", - "sin-Sinh", - "slk-Latn", - "slv-Latn", - "smo-Latn", - "sna-Latn", - "som-Latn", - "sqi-Latn", - "srp-Cyrl", - "sot-Latn", - "sun-Latn", - "swe-Latn", - "swa-Latn", - "tam-Taml", - "tel-Telu", - "tgk-Cyrl", - "tha-Thai", - "tuk-Latn", - "tgl-Latn", - "tur-Latn", - "tat-Cyrl", - "uig-Arab", - "ukr-Cyrl", - "urd-Arab", - "uzb-Latn", - "vie-Latn", - "wol-Latn", - "xho-Latn", - "yid-Hebr", - "yor-Latn", - "zho-Hans", - "zul-Latn", -] - - -class CohereBedrockWrapper(Wrapper): - def __init__( - self, model_id: str, model_prompts: dict[str, str] | None = None, **kwargs - ) -> None: - requires_package(self, "boto3", "Amazon Bedrock") - import boto3 - - boto3_session = boto3.session.Session() - region_name = boto3_session.region_name - self._client = boto3.client( - "bedrock-runtime", - region_name, - ) - self._model_id = model_id - self.model_prompts = ( - self.validate_task_to_prompt_name(model_prompts) if model_prompts else None - ) - - def _embed( - self, - sentences: list[str], - cohere_task_type: str, - show_progress_bar: bool = False, - ) -> np.ndarray: - max_batch_size = 96 - - batches = [ - sentences[i : i + max_batch_size] - for i in range(0, len(sentences), max_batch_size) - ] - - all_embeddings = [] - - for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): - response = self._client.invoke_model( - body=json.dumps( - { - "texts": [sent[:2048] for sent in batch], - "input_type": cohere_task_type, - } - ), - modelId=self._model_id, - accept="*/*", - contentType="application/json", - ) - all_embeddings.extend(self._to_numpy(response)) - - return np.array(all_embeddings) - - def encode( - self, - sentences: list[str], - *, - task_name: str, - prompt_type: PromptType | None = None, - **kwargs: Any, - ) -> np.ndarray: - prompt_name = self.get_prompt_name(self.model_prompts, task_name, prompt_type) - cohere_task_type = self.model_prompts.get(prompt_name) - - if cohere_task_type is None: - # search_document is recommended if unknown (https://cohere.com/blog/introducing-embed-v3) - cohere_task_type = "search_document" - - show_progress_bar = ( - False - if "show_progress_bar" not in kwargs - else kwargs.pop("show_progress_bar") - ) - - return self._embed( - sentences, - cohere_task_type=cohere_task_type, - show_progress_bar=show_progress_bar, - ) - - def _to_numpy(self, embedding_response) -> np.ndarray: - response = json.loads(embedding_response.get("body").read()) - return np.array(response["embeddings"]) - - -model_prompts = { - "Classification": "classification", - "MultilabelClassification": "classification", - "Clustering": "clustering", - PromptType.query.value: "search_query", - PromptType.passage.value: "search_document", -} - -cohere_embed_english_v3 = ModelMeta( - loader=partial( - CohereBedrockWrapper, - model_id="cohere.embed-english-v3", - model_prompts=model_prompts, - ), - name="bedrock/cohere-embed-english-v3", - languages=["eng-Latn"], - open_weights=False, - reference="https://cohere.com/blog/introducing-embed-v3", - revision="1", - release_date="2023-11-02", - n_parameters=None, - memory_usage=None, - max_tokens=512, - embed_dim=1024, - license=None, - similarity_fn_name="cosine", - framework=["API"], - use_instructions=True, -) - -cohere_embed_multilingual_v3 = ModelMeta( - loader=partial( - CohereBedrockWrapper, - model_id="cohere.embed-multilingual-v3", - model_prompts=model_prompts, - ), - name="cohere-embed-multilingual-v3", - languages=supported_languages, - open_weights=False, - reference="https://cohere.com/blog/introducing-embed-v3", - revision="1", - release_date="2023-11-02", - n_parameters=None, - memory_usage=None, - max_tokens=512, - embed_dim=1024, - license=None, - similarity_fn_name="cosine", - framework=["API"], - use_instructions=True, -) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index c908b24b27..01c6e476c3 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -11,11 +11,10 @@ from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import ( - amazon_models, arctic_models, + bedrock_models, bge_models, bm25, - cohere_bedrock_models, cohere_models, colbert_models, e5_instruct, @@ -87,8 +86,7 @@ jasper_models, uae_models, stella_models, - amazon_models, - cohere_bedrock_models, + bedrock_models, uae_models, voyage_models, ] From f50cd66dea6e83bce1d733ccb6500bf87b97cae9 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 5 Jan 2025 21:20:57 +0000 Subject: [PATCH 44/49] Remove unnecessary `load_data` for some tasks. --- .../eng/PubChemSMILESBitextMining.py | 18 +----------------- .../eng/PubChemAISentenceParaphrasePC.py | 15 --------------- .../PairClassification/eng/PubChemSynonymPC.py | 16 ---------------- .../eng/PubChemWikiParagraphsPC.py | 15 --------------- 4 files changed, 1 insertion(+), 63 deletions(-) diff --git a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py index f2a2f3baa7..4951d8c596 100644 --- a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py +++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -21,7 +19,7 @@ } -class PubChemSMILESBitextMining(AbsTaskBitextMining, MultilingualTask): +class PubChemSMILESBitextMining(MultilingualTask, AbsTaskBitextMining): metadata = TaskMetadata( name="PubChemSMILESBitextMining", dataset={ @@ -63,20 +61,6 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining, MultilingualTask): """, ) - def load_data(self, **kwargs): - """Load dataset from HuggingFace hub and convert it to the standard format.""" - if self.data_loaded: - return - self.dataset = {} - - for subset in self.hf_subsets: - self.dataset[subset] = datasets.load_dataset( - **self.metadata_dict["dataset"], name=subset - ) - - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): for subset in self.hf_subsets: self.dataset[subset] = self.dataset[subset].rename_columns( diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py index 54851249c8..f453ebee31 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata @@ -48,19 +46,6 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): """, ) - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): _dataset = {} for split in self.metadata.eval_splits: diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py index 965aca2666..6b6dfd81c8 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata @@ -48,20 +46,6 @@ class PubChemSynonymPC(AbsTaskPairClassification): """, ) - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - _dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - self.dataset = _dataset - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): _dataset = {} diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py index e2836f6bfe..679580f28c 100644 --- a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata @@ -48,19 +46,6 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): """, ) - def load_data(self): - """Load dataset from HuggingFace hub""" - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], - revision=self.metadata_dict["dataset"]["revision"], - trust_remote_code=True, - ) - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): _dataset = {} for split in self.metadata.eval_splits: From 1dd051ce5345f807e3688acb45f1e331b42f39d8 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Wed, 8 Jan 2025 20:45:35 +0000 Subject: [PATCH 45/49] Update `bedrock_models.py`, `openai_models.py` and two dataset revisions - Text should be truncated for amazon text embedding models. - `text-embedding-ada-002` returns null embeddings for some inputs with 8192 tokens. - Two datasets are updated, dropping very long samples (len > 99th percentile) --- mteb/models/bedrock_models.py | 33 ++++++++++++++----- mteb/models/openai_models.py | 2 +- .../eng/WikipediaChemFieldsClassification.py | 2 +- ...kipediaTheoreticalAppliedClassification.py | 2 +- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py index c3bfbfcdf0..6521a4ee85 100644 --- a/mteb/models/bedrock_models.py +++ b/mteb/models/bedrock_models.py @@ -24,6 +24,7 @@ def __init__( self, model_id: str, provider: str, + max_tokens: int, model_prompts: dict[str, str] | None = None, **kwargs, ) -> None: @@ -44,7 +45,9 @@ def __init__( else None ) self._max_batch_size = 96 - self._max_sequence_length = 2048 + self._max_sequence_length = max_tokens * 4 + else: + self._max_tokens = max_tokens def encode( self, @@ -77,17 +80,18 @@ def _encode_amazon( self, sentences: list[str], show_progress_bar: bool = False ) -> np.ndarray: all_embeddings = [] + # https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html + max_sequence_length = int(self._max_tokens * 4.5) for sentence in tqdm.tqdm( sentences, leave=False, disable=not show_progress_bar ): - response = self._client.invoke_model( - body=json.dumps({"inputText": sentence}), - modelId=self._model_id, - accept="application/json", - contentType="application/json", - ) - all_embeddings.append(self._to_numpy(response)) + if len(sentence) > max_sequence_length: + all_embeddings.append( + self._embed_amazon(sentence[:max_sequence_length]) + ) + else: + all_embeddings.append(self._embed_amazon(sentence)) return np.array(all_embeddings) @@ -120,6 +124,15 @@ def _encode_cohere( return np.array(all_embeddings) + def _embed_amazon(self, sentence: str) -> np.ndarray: + response = self._client.invoke_model( + body=json.dumps({"inputText": sentence}), + modelId=self._model_id, + accept="application/json", + contentType="application/json", + ) + return self._to_numpy(response) + def _to_numpy(self, embedding_response) -> np.ndarray: response = json.loads(embedding_response.get("body").read()) key = "embedding" if self._provider == "amazon" else "embeddings" @@ -135,6 +148,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: BedrockWrapper, model_id="amazon.titan-embed-text-v1", provider="amazon", + max_tokens=8192, ), max_tokens=8192, embed_dim=1536, @@ -157,6 +171,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: BedrockWrapper, model_id="amazon.titan-embed-text-v2:0", provider="amazon", + max_tokens=8192, ), max_tokens=8192, embed_dim=1024, @@ -175,6 +190,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: BedrockWrapper, model_id="cohere.embed-english-v3", provider="cohere", + max_tokens=512, model_prompts=cohere_model_prompts, ), name="bedrock/cohere-embed-english-v3", @@ -198,6 +214,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: BedrockWrapper, model_id="cohere.embed-multilingual-v3", provider="cohere", + max_tokens=512, model_prompts=cohere_model_prompts, ), name="bedrock/cohere-embed-multilingual-v3", diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index aecacf549a..e9f79db575 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -166,7 +166,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), reference="https://openai.com/index/new-and-improved-embedding-model/", max_tokens=8191, diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py index 07509d427f..7c0179fb1e 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py @@ -11,7 +11,7 @@ class WikipediaChemFieldsClassification(AbsTaskClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/WikipediaEZ10Classification", - "revision": "bb465f7e0dc023c7effc39b45aa268ff70d4312c", + "revision": "a75fae77759acc115f015f2b856baa47776d733d", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py index 835ce579bc..f33b02f4bb 100644 --- a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py @@ -11,7 +11,7 @@ class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): reference="https://arxiv.org/abs/2412.00532", dataset={ "path": "BASF-AI/WikipediaEZ2Classification", - "revision": "39350f72444caf0cff039dbf0f57933d8226f73e", + "revision": "7896906653d31d7102a143d7f55d67cd688e3147", }, type="Classification", category="s2s", From 7b93330c77f93e38060ef0354200c18fbfd4cfe8 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sat, 11 Jan 2025 22:32:20 +0000 Subject: [PATCH 46/49] Add a layer of dynamic truncation for amazon models in `bedrock_models.py` --- mteb/models/bedrock_models.py | 28 ++++++++++++++++++++++++---- mteb/models/openai_models.py | 2 +- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py index 6521a4ee85..60b20d9263 100644 --- a/mteb/models/bedrock_models.py +++ b/mteb/models/bedrock_models.py @@ -2,6 +2,7 @@ import json import logging +import re from functools import partial from typing import Any @@ -79,6 +80,8 @@ def encode( def _encode_amazon( self, sentences: list[str], show_progress_bar: bool = False ) -> np.ndarray: + from botocore.exceptions import ValidationError + all_embeddings = [] # https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html max_sequence_length = int(self._max_tokens * 4.5) @@ -87,11 +90,28 @@ def _encode_amazon( sentences, leave=False, disable=not show_progress_bar ): if len(sentence) > max_sequence_length: - all_embeddings.append( - self._embed_amazon(sentence[:max_sequence_length]) - ) + truncated_sentence = sentence[:max_sequence_length] else: - all_embeddings.append(self._embed_amazon(sentence)) + truncated_sentence = sentence + + try: + embedding = self._embed_amazon(truncated_sentence) + all_embeddings.append(embedding) + + except ValidationError as e: + error_str = str(e) + pattern = r"request input token count:\s*(\d+)" + match = re.search(pattern, error_str) + if match: + num_tokens = int(match.group(1)) + + ratio = 0.9 * (self._max_tokens / num_tokens) + dynamic_cutoff = int(len(truncated_sentence) * ratio) + + embedding = self._embed_amazon(truncated_sentence[:dynamic_cutoff]) + all_embeddings.append(embedding) + else: + raise e return np.array(all_embeddings) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index e9f79db575..aecacf549a 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -166,7 +166,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", - max_tokens=8191, + max_tokens=8192, ), reference="https://openai.com/index/new-and-improved-embedding-model/", max_tokens=8191, From 064d0537c06ee0a108c6e36e73809dc8ffdeee38 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Sun, 12 Jan 2025 14:27:11 +0000 Subject: [PATCH 47/49] Replace `metadata_dict` with `self.metadata` in `PubChemSMILESPC.py` --- mteb/tasks/PairClassification/eng/PubChemSMILESPC.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py index 6b02b156c0..b3e297e043 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -83,9 +83,9 @@ def load_data(self): _hf_dataset = None for dataset_col_map in _DATASET_COLUMN_MAP: _dataset = datasets.load_dataset( - self.metadata_dict["dataset"]["path"], + self.metadata.dataset["path"], dataset_col_map["name"], - revision=self.metadata_dict["dataset"]["revision"], + revision=self.metadata.dataset["revision"], ) _dataset = _dataset.rename_columns( @@ -111,7 +111,7 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, - splits=self.metadata_dict["eval_splits"], + splits=self.metadata["eval_splits"], label="labels", ) From 0156440842e4ed35529bdb71dd0ab2057a15b2cb Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Thu, 23 Jan 2025 17:10:42 +0000 Subject: [PATCH 48/49] fix model meta for bedrock models --- mteb/models/bedrock_models.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py index 60b20d9263..af063dab3e 100644 --- a/mteb/models/bedrock_models.py +++ b/mteb/models/bedrock_models.py @@ -174,7 +174,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray: embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, license=None, reference="https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/", similarity_fn_name="cosine", @@ -197,7 +199,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray: embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, license=None, reference="https://aws.amazon.com/about-aws/whats-new/2024/04/amazon-titan-text-embeddings-v2-amazon-bedrock/", similarity_fn_name="cosine", @@ -220,7 +224,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray: revision="1", release_date="2023-11-02", n_parameters=None, - memory_usage=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, max_tokens=512, embed_dim=1024, license=None, @@ -244,7 +250,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray: revision="1", release_date="2023-11-02", n_parameters=None, - memory_usage=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, max_tokens=512, embed_dim=1024, license=None, From c630053e52ea2d3e9e3270ad8bdaebeb43ab3a59 Mon Sep 17 00:00:00 2001 From: Ali Shiraee Date: Fri, 24 Jan 2025 22:29:38 +0000 Subject: [PATCH 49/49] Add reference comment to original Cohere API implementation --- mteb/models/bedrock_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py index af063dab3e..4616209df1 100644 --- a/mteb/models/bedrock_models.py +++ b/mteb/models/bedrock_models.py @@ -208,7 +208,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, ) - +# Note: For the original Cohere API implementation, refer to: +# https://github.com/embeddings-benchmark/mteb/blob/main/mteb/models/cohere_models.py +# This implementation uses the Amazon Bedrock endpoint for Cohere models. cohere_embed_english_v3 = ModelMeta( loader=partial( BedrockWrapper,