diff --git a/docs/tasks.md b/docs/tasks.md index 15b9474168..941d26e4b4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -131,6 +131,9 @@ The following tables give you an overview of the tasks in MTEB. | [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 
'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] 
| {'test': 79660} | {'test': {'number_of_characters': 156266302, 'num_samples': 79660, 'num_queries': 13277, 'num_documents': 66383, 'min_document_length': 127, 'average_document_length': 885.13, 'max_document_length': 32432, 'unique_documents': 66383, 'min_query_length': 2, 'average_query_length': 7344.18, 'max_query_length': 9403, 'unique_queries': 13277, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13277}} | | [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 187832} | {'test': {'number_of_characters': 260957682, 'num_samples': 187832, 'num_queries': 31306, 'num_documents': 156526, 'min_document_length': 26, 'average_document_length': 144.85, 'max_document_length': 13851, 'unique_documents': 156526, 'min_query_length': 1, 'average_query_length': 7611.46, 'max_query_length': 11354, 'unique_queries': 31306, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 31306}} | +| [CodeRAGLibraryDocumentationSolutions](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': 61198} | {'train': {'number_of_characters': 2571365, 'num_samples': 61198, 'num_queries': 30599, 'num_documents': 30599, 'min_document_length': 2, 'average_document_length': 82.03, 'max_document_length': 43706, 'unique_documents': 30599, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 30599, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 30599}} | +| [CodeRAGOnlineTutorials](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': 153286} | {'train': {'number_of_characters': 4241139, 'num_samples': 153286, 'num_queries': 76643, 'num_documents': 76643, 'min_document_length': 3, 'average_document_length': 53.34, 'max_document_length': 221, 'unique_documents': 76643, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 76643, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 76643}} | +| [CodeRAGProgrammingSolutions](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': 1972} | {'train': {'number_of_characters': 80085, 'num_samples': 1972, 'num_queries': 986, 'num_documents': 986, 'min_document_length': 11, 'average_document_length': 79.22, 'max_document_length': 251, 'unique_documents': 986, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 986, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 986}} | | [CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1058035} | {'test': {'number_of_characters': 22407915, 'num_samples': 1058035, 'num_queries': 52561, 'num_documents': 1005474, 'min_document_length': 23, 'average_document_length': 20.29, 'max_document_length': 214210, 'unique_documents': 1005474, 'min_query_length': 2, 'average_query_length': 38.26,
'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 8792958, 'num_samples': 295570, 'num_queries': 14918, 'num_documents': 280652, 'min_document_length': 38, 'average_document_length': 29.33, 'max_document_length': 8326, 'unique_documents': 280652, 'min_query_length': 2, 'average_query_length': 37.63, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 1590642, 'num_samples': 68492, 'num_queries': 3291, 'num_documents': 65201, 'min_document_length': 40, 'average_document_length': 22.4, 'max_document_length': 214210, 'unique_documents': 65201, 'min_query_length': 2, 'average_query_length': 39.62, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 2264134, 'num_samples': 190857, 'num_queries': 8122, 'num_documents': 182735, 'min_document_length': 23, 'average_document_length': 10.39, 'max_document_length': 3589, 'unique_documents': 182735, 'min_query_length': 2, 'average_query_length': 45.0, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 391703, 'num_samples': 28849, 'num_queries': 1261, 'num_documents': 27588, 'min_document_length': 36, 'average_document_length': 12.2, 'max_document_length': 2244, 'unique_documents': 27588, 'min_query_length': 2, 'average_query_length': 43.76, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 4114584, 'num_samples': 192016, 'num_queries': 10955, 'num_documents': 181061, 'min_document_length': 38, 'average_document_length': 20.72, 'max_document_length': 5066, 'unique_documents': 181061, 'min_query_length': 2, 'average_query_length': 33.06, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 5253894, 'num_samples': 282251, 'num_queries': 14014, 'num_documents': 268237, 'min_document_length': 40, 'average_document_length': 17.59, 'max_document_length': 2995, 'unique_documents': 268237, 'min_query_length': 2, 'average_query_length': 38.28, 'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 12000} | {'test': {'number_of_characters': 1950074, 'num_samples': 12000, 'num_queries': 6000, 'num_documents': 6000, 'min_document_length': 2, 'average_document_length': 324.01, 'max_document_length': 17533, 'unique_documents': 6000, 'min_query_length': 1, 'average_query_length': 1.0, 
'max_query_length': 1, 'unique_queries': 6000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 467546, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 8, 'average_document_length': 466.55, 'max_document_length': 8636, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 187018, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 186.02, 'max_document_length': 7657, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 126213, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 14, 'average_document_length': 125.21, 'max_document_length': 1501, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 314818, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 313.82, 'max_document_length': 17533, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 691360, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 690.36, 'max_document_length': 6473, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 163119, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 162.12, 'max_document_length': 1240, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1229} | {'test': {'number_of_characters': 1744286, 'num_samples': 1229, 'num_queries': 221, 'num_documents': 1008, 'min_document_length': 8, 'average_document_length': 221.9, 'max_document_length': 4147, 'unique_documents': 1008, 'min_query_length': 8, 'average_query_length': 6880.58, 'max_query_length': 10852, 'unique_queries': 221, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 
'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 221}} | diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 415b11eddb..acb8b8a7d2 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1153,6 +1153,29 @@ def load_results( }""", ) + +CODE_RAG = Benchmark( + name="CodeRAG", + tasks=get_tasks( + tasks=[ + "CodeRAGLibraryDocumentationSolutions", + "CodeRAGOnlineTutorials", + "CodeRAGProgrammingSolutions", + "CodeRAGStackoverflowPosts", + ], + ), + description="A benchmark for evaluating code retrieval-augmented generation, testing models' ability to retrieve relevant programming solutions, tutorials and documentation.", + reference="https://arxiv.org/abs/2406.14497", + citation="""@misc{wang2024coderagbenchretrievalaugmentcode, + title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + year={2024}, + eprint={2406.14497}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2406.14497}, + }""", +) + NANOBEIR = Benchmark( name="NanoBEIR", tasks=get_tasks( diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 6c146379ea..26d3dd667c 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -5,6 +5,7 @@ from .code.CodeEditSearchRetrieval import * from .code.CodeFeedbackMTRetrieval import * from .code.CodeFeedbackSTRetrieval import * +from .code.CodeRAG import * from .code.CodeSearchNetCCRetrieval import * from .code.CodeSearchNetRetrieval import * from .code.CodeTransOceanContestRetrieval import * diff --git a/mteb/tasks/Retrieval/code/CodeRAG.py b/mteb/tasks/Retrieval/code/CodeRAG.py new file mode 100644 index 0000000000..3724f44eca --- /dev/null +++ b/mteb/tasks/Retrieval/code/CodeRAG.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +def split_by_first_newline(s: str) -> tuple[str, str]: + # Split on the first newline; the tail is "" when there is no newline + parts = s.split("\n", 1) + return (parts[0], parts[1]) if len(parts) > 1 else (s, "") + + +common_args = { + "reference": "https://arxiv.org/abs/2406.14497", + "type": "Retrieval", + "category": "p2p", + "modalities": ["text"], + "eval_splits": ["train"], + "eval_langs": ["python-Code"], + "main_score": "ndcg_at_10", + "date": ("2024-06-02", "2024-06-02"), # best guess + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "cc-by-sa-4.0", + "annotations_creators": "derived", + "dialect": [], + "sample_creation": "found", + "bibtex_citation": """ + @misc{wang2024coderagbenchretrievalaugmentcode, + title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + year={2024}, + eprint={2406.14497}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2406.14497}, + } + """, +} + + +class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGProgrammingSolutions", + description="Evaluation of programming solution retrieval using CodeRAG-Bench.
Tests the ability to retrieve relevant programming solutions given code-related queries.", + dataset={ + "path": "code-rag-bench/programming-solutions", + "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """Transform the raw dataset into a retrieval dataset with the following attributes: + + self.corpus = Dict[doc_id, Dict[str, str]] # doc_id => dict with document data such as title and text + self.queries = Dict[query_id, str] # query_id => query + self.relevant_docs = Dict[query_id, Dict[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["text"] + meta = ds["meta"] + for text, mt in zip(texts, meta): + # in code-rag-bench, text = query + "\n" + doc (code) + query, doc = split_by_first_newline(text) + + task_id = mt["task_id"] + + query_id = task_id + doc_id = f"doc_{task_id}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + # exactly one relevant document per query + self.relevant_docs[split][query_id] = {doc_id: 1} + + +class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGOnlineTutorials", + description="Evaluation of online programming tutorial retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant tutorials from online platforms given code-related queries.", + dataset={ + "path": "code-rag-bench/online-tutorials", + "revision": "095bb77130082e4690d6c3a031997b03487bf6e2", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """Transform the raw dataset into a retrieval dataset with the following attributes: + + self.corpus = Dict[doc_id, Dict[str, str]] # doc_id => dict with document data such as title and text + self.queries = Dict[query_id, str] # query_id => query + self.relevant_docs = Dict[query_id, Dict[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + titles = ds["title"] + texts = ds["text"] + idx = 0 + for title, text in zip(titles, texts): + # in code-rag-bench online-tutorials, the page title serves as the + # query and the page body as the document + query, doc = title, text + + query_id = str(idx) + doc_id = f"doc_{idx}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + # exactly one relevant document per query + self.relevant_docs[split][query_id] = {doc_id: 1} + + idx += 1 + + +class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGLibraryDocumentationSolutions", + description="Evaluation of code library documentation retrieval using
CodeRAG-Bench. Tests the ability to retrieve relevant Python library documentation sections given code-related queries.", + dataset={ + "path": "code-rag-bench/library-documentation", + "revision": "b530d3b5a25087d2074e731b76232db85b9e9107", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """Transform the raw dataset into a retrieval dataset with the following attributes: + + self.corpus = Dict[doc_id, Dict[str, str]] # doc_id => dict with document data such as title and text + self.queries = Dict[query_id, str] # query_id => query + self.relevant_docs = Dict[query_id, Dict[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["doc_content"] + + idx = 0 + for text in texts: + # text format: "document title \n document content" + query, doc = split_by_first_newline(text) + + # some library documents don't form a query-doc pair + if not doc: + continue + query_id = str(idx) + doc_id = f"doc_{idx}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + # exactly one relevant document per query + self.relevant_docs[split][query_id] = {doc_id: 1} + idx += 1 + + +class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGStackoverflowPosts", + description="Evaluation of StackOverflow post retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant StackOverflow posts given code-related queries.", + dataset={ + "path": "code-rag-bench/stackoverflow-posts", + "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """Transform the raw dataset into a retrieval dataset with the following attributes: + + self.corpus = Dict[doc_id, Dict[str, str]] # doc_id => dict with document data such as title and text + self.queries = Dict[query_id, str] # query_id => query + self.relevant_docs = Dict[query_id, Dict[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["text"] + idx = 0 + for text in texts: + # in code-rag-bench, text = query + "\n" + doc + query, doc = split_by_first_newline(text) + + query_id = str(idx) + doc_id = f"doc_{idx}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + # exactly one relevant document per query + self.relevant_docs[split][query_id] = {doc_id: 1} + idx += 1
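+ + +# A minimal usage sketch (illustrative only, not part of the task definitions): +# once these tasks are registered via the import in __init__.py, they can be +# evaluated like any other mteb task; the model name below is an arbitrary +# example, not one endorsed by CodeRAG-Bench. +# +# import mteb +# from sentence_transformers import SentenceTransformer +# +# model = SentenceTransformer("all-MiniLM-L6-v2") +# tasks = mteb.get_tasks(tasks=["CodeRAGProgrammingSolutions"]) +# mteb.MTEB(tasks=tasks).run(model)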