Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Add datasets in CodeRAG-Bench #1595

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,8 @@ def load_data(self, **kwargs):
"trust_remote_code", False
),
).load(split=split)


Samoed marked this conversation as resolved.
Show resolved Hide resolved
# Conversion from DataSet
queries = {query["id"]: query["text"] for query in queries}
corpus = {
Expand Down Expand Up @@ -346,6 +348,7 @@ def _evaluate_subset(
end_time = time()
logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds")


save_predictions = kwargs.get("save_predictions", False)
export_errors = kwargs.get("export_errors", False)
if save_predictions or export_errors:
Expand Down
5 changes: 4 additions & 1 deletion mteb/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def from_languages_and_scripts(
script_codes: set[str] = set(scripts) if (scripts is not None) else set()
# normalize to 3 letter language codes
normalized_langs = set()

# if type(languages) is not list and languages is not None:
# import pdb; pdb.set_trace()
# print('')
Samoed marked this conversation as resolved.
Show resolved Hide resolved

if languages is not None:
for lang in languages:
lang_script = lang.split("-")
Expand Down
4 changes: 4 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
from .code.CodeTransOceanDLRetrieval import *
from .code.COIRCodeSearchNetRetrieval import *
from .code.CosQARetrieval import *
from .code.CodeRAGOnlineTutorialsRetrieval import *
from .code.CodeRAGLibraryDocumentationRetrieval import *
from .code.CodeRAGProgrammingSolutionsRetrieval import *
from .code.CodeRAGStackoverflowPostsRetrieval import *
from .code.StackOverflowQARetrieval import *
from .code.SyntheticText2SqlRetrieval import *
from .dan.DanFeverRetrieval import *
Expand Down
96 changes: 96 additions & 0 deletions mteb/tasks/Retrieval/code/CodeRAGLibraryDocumentationRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata


def split_by_first_newline(s):
    """Split *s* at the first newline into a ``(head, tail)`` pair.

    Returns a 2-tuple: the text before the first ``"\n"`` and the text
    after it. If *s* contains no newline, returns ``(s, "")``.

    Fix: the original returned a *list* when a newline was present but a
    *tuple* otherwise; callers only unpack, but the inconsistent return
    type is a latent bug for anyone comparing or hashing the result.
    ``str.partition`` gives a single-pass, always-tuple implementation.
    """
    head, _, tail = s.partition("\n")
    return (head, tail)
isaac-chung marked this conversation as resolved.
Show resolved Hide resolved


class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
    metadata = TaskMetadata(
        name="CodeRAGLibraryDocumentationSolutions",
        # Fixed: description was copy-pasted from an unrelated scientific-paper
        # task; it now describes this dataset.
        description="Retrieval of Python library documentation from CodeRAG-Bench. Each raw entry is 'documentation title \\n documentation content'; the title is used as the query and the content as the document to retrieve.",
        reference="https://arxiv.org/pdf/2406.14497",
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_splits=["train"],
        eval_langs=["python-Code"],
        main_score="ndcg_at_10",
        dataset={
            "path": "code-rag-bench/library-documentation",
            "revision": "b530d3b5a25087d2074e731b76232db85b9e9107",
        },
        date=("2024-06-02", "2024-06-02"),  # best guess
        domains=["Programming"],
        task_subtypes=["Code retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""
@misc{wang2024coderagbenchretrievalaugmentcode,
      title={CodeRAG-Bench: Can Retrieval Augment Code Generation?},
      author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried},
      year={2024},
      eprint={2406.14497},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2406.14497},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Load the dataset from the HuggingFace hub and convert it to retrieval format."""
        if self.data_loaded:
            return
        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Transform the raw dataset into the retrieval attributes:

        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # doc_id => {"title": ..., "text": ...}
        self.queries = Dict[split, Dict[query_id, str]]          # query_id => query text
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, int]]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)
            # NOTE(review): the raw split name is discarded and everything is
            # exposed under "test", yet metadata.eval_splits is ["train"] —
            # one of the two looks wrong; confirm which split is evaluated.
            split = "test"

            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            # Each record's "doc_content" is "document title \n document content";
            # the title becomes the query and the content the document. The
            # "doc_id" column is not needed — sequential ids are assigned instead.
            next_id = 0
            for text in ds["doc_content"]:
                query, doc = split_by_first_newline(text)

                # Some library documents have no body after the title; they
                # cannot form a query-doc pair, so skip them.
                if not doc:
                    continue
                query_id = str(next_id)
                doc_id = f"doc_{next_id}"
                self.queries[split][query_id] = query
                self.corpus[split][doc_id] = {"title": "", "text": doc}

                # Exactly one relevant document per query.
                self.relevant_docs[split][query_id] = {doc_id: 1}
                next_id += 1
90 changes: 90 additions & 0 deletions mteb/tasks/Retrieval/code/CodeRAGOnlineTutorialsRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata


class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
    metadata = TaskMetadata(
        name="CodeRAGOnlineTutorials",
        # Fixed: description was copy-pasted from an unrelated scientific-paper
        # task; it now describes this dataset.
        description="Retrieval of programming tutorials from CodeRAG-Bench's online-tutorials corpus. The tutorial title is used as the query and the tutorial text as the document to retrieve.",
        reference="https://arxiv.org/pdf/2406.14497",
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_splits=["train"],
        eval_langs=["python-Code"],
        main_score="ndcg_at_10",
        dataset={
            "path": "code-rag-bench/online-tutorials",
            "revision": "095bb77130082e4690d6c3a031997b03487bf6e2",
        },
        date=("2024-06-02", "2024-06-02"),  # best guess
        domains=["Programming"],
        task_subtypes=["Code retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""
@misc{wang2024coderagbenchretrievalaugmentcode,
      title={CodeRAG-Bench: Can Retrieval Augment Code Generation?},
      author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried},
      year={2024},
      eprint={2406.14497},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2406.14497},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Load the dataset from the HuggingFace hub and convert it to retrieval format."""
        if self.data_loaded:
            return
        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Transform the raw dataset into the retrieval attributes:

        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # doc_id => {"title": ..., "text": ...}
        self.queries = Dict[split, Dict[query_id, str]]          # query_id => query text
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, int]]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)

            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            # In code-rag-bench online-tutorials, "title" is the tutorial
            # title (used as the query) and "text" is the tutorial body (the
            # document). The original also materialized the unused "parsed"
            # column; that dead fetch is removed.
            for idx, (title, text) in enumerate(zip(ds["title"], ds["text"])):
                query, doc = title, text

                query_id = str(idx)
                doc_id = f"doc_{idx}"
                self.queries[split][query_id] = query
                self.corpus[split][doc_id] = {"title": "", "text": doc}

                # Exactly one relevant document per query.
                self.relevant_docs[split][query_id] = {doc_id: 1}
92 changes: 92 additions & 0 deletions mteb/tasks/Retrieval/code/CodeRAGProgrammingSolutionsRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata


def split_by_first_newline(s):
    """Split *s* at the first newline into a ``(head, tail)`` pair.

    Returns a 2-tuple: the text before the first ``"\n"`` and the text
    after it. If *s* contains no newline, returns ``(s, "")``.

    Fix: the original returned a *list* when a newline was present but a
    *tuple* otherwise; callers only unpack, but the inconsistent return
    type is a latent bug. ``str.partition`` always yields a tuple.
    """
    head, _, tail = s.partition("\n")
    return (head, tail)


class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
    metadata = TaskMetadata(
        name="CodeRAGProgrammingSolutions",
        # Fixed: description was copy-pasted from an unrelated scientific-paper
        # task; it now describes this dataset.
        description="Retrieval of programming solutions from CodeRAG-Bench. Each raw entry is 'problem description \\n solution code'; the description is used as the query and the code as the document to retrieve.",
        reference="https://arxiv.org/pdf/2406.14497",
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_splits=["train"],
        eval_langs=["python-Code"],
        main_score="ndcg_at_10",
        dataset={
            "path": "code-rag-bench/programming-solutions",
            "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
        },
        date=("2024-06-02", "2024-06-02"),  # best guess
        domains=["Programming"],
        task_subtypes=["Code retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""
@misc{wang2024coderagbenchretrievalaugmentcode,
      title={CodeRAG-Bench: Can Retrieval Augment Code Generation?},
      author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried},
      year={2024},
      eprint={2406.14497},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2406.14497},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Load the dataset from the HuggingFace hub and convert it to retrieval format."""
        if self.data_loaded:
            return
        self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Transform the raw dataset into the retrieval attributes:

        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # doc_id => {"title": ..., "text": ...}
        self.queries = Dict[split, Dict[query_id, str]]          # query_id => query text
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, int]]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)

            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            # In code-rag-bench programming-solutions,
            # text = problem description + "\n" + solution code, and each
            # record's meta carries its upstream "task_id", which is reused
            # as the query/doc identifier here.
            for text, meta in zip(ds["text"], ds["meta"]):
                query, doc = split_by_first_newline(text)

                task_id = meta["task_id"]
                query_id = task_id
                doc_id = f"doc_{task_id}"
                self.queries[split][query_id] = query
                self.corpus[split][doc_id] = {"title": "", "text": doc}

                # Exactly one relevant document per query.
                self.relevant_docs[split][query_id] = {doc_id: 1}
Loading