Commit

Add mtrag benchmark (#1548)
* Add mtrag benchmark

Signed-off-by: elronbandel <[email protected]>

* Add multi_type_serializer for references and prediction fields in various JSON metrics

Signed-off-by: elronbandel <[email protected]>

* Remove unused TempOperator class and delete obsolete multi_turn.json task file

Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: elronbandel <[email protected]>
elronbandel authored and tejaswini-nexplore committed Jan 24, 2025
1 parent 7421e72 commit 3a0d99a
Showing 35 changed files with 534 additions and 18 deletions.
131 changes: 131 additions & 0 deletions prepare/cards/mtrag.py
@@ -0,0 +1,131 @@
import json

from unitxt import add_to_catalog
from unitxt.blocks import (
    TaskCard,
)
from unitxt.collections_operators import Dictify, Wrap
from unitxt.loaders import LoadCSV
from unitxt.operators import (
    Cast,
    Copy,
    MapInstanceValues,
    Set,
    ZipFieldValues,
)
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadCSV(
        files={
            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
        },
        file_type="json",
        lines=True,
        data_classification_policy=["public"],
    ),
    preprocess_steps=[
        MapInstanceValues(
            {
                "Answerability": {
                    "['UNANSWERABLE']": False,
                    "['ANSWERABLE']": True,
                    "['PARTIAL']": True,
                },
            }
        ),
        Copy(
            field_to_field={
                "targets/*/text": "reference_answers",
                "Answerability": "is_answerable_label",
                "task_id": "question_id",
                "contexts/*/document_id": "reference_context_ids",
                "contexts/*/text": "reference_contexts",
                "input/*/speaker": "roles",
                "input/*/text": "contents",
            },
        ),
        ZipFieldValues(
            fields=["roles", "contents"],
            to_field="conversation",
        ),
        Dictify(
            field="conversation",
            with_keys=["role", "content"],
            to_field="question",
            process_every_value=True,
        ),
    ],
    task="tasks.rag.end_to_end",
    templates={"default": "templates.rag.end_to_end.json_predictions"},
    __tags__={"license": "apache-2.0"},
    __description__="""MTRAG: a comprehensive and diverse human-generated multi-turn RAG dataset, accompanied by four document corpora. To the best of our knowledge, MTRAG is the first end-to-end human-generated multi-turn RAG benchmark that reflects real-world properties of multi-turn conversations.
""",
)
wrong_answer = {
    "contexts": ["hi"],
    "is_answerable": True,
    "answer": "Don't know",
    "context_ids": ["id0"],
}

test_card(
    card,
    strict=False,
    full_mismatch_prediction_values=[json.dumps(wrong_answer)],
    debug=False,
    demos_taken_from="test",
    demos_pool_size=5,
)

add_to_catalog(card, "cards.rag.mtrag", overwrite=True)


for subset in ["clapnq", "cloud", "fiqa", "govt"]:
    subset_operators = []
    if subset in ["fiqa", "clapnq"]:
        subset_operators.append(
            Cast(
                field="_id",
                to="str",
                to_field="document_id",
            )
        )
    if subset in ["cloud"]:
        subset_operators.append(Set(fields={"title": ""}))

    card = TaskCard(
        loader=LoadCSV(
            files={
                "test": f"https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/{subset}.jsonl.zip"
            },
            compression="zip",
            file_type="json",
            lines=True,
            data_classification_policy=["public"],
        ),
        preprocess_steps=[
            *subset_operators,
            Wrap(field="text", inside="list", to_field="passages"),
            Set(
                fields={
                    "metadata_field": "",
                }
            ),
        ],
        task="tasks.rag.corpora",
        templates={
            "empty": InputOutputTemplate(
                input_format="",
                output_format="",
            ),
        },
    )
    test_card(
        card,
        strict=False,
        demos_taken_from="test",
    )

    add_to_catalog(card, f"cards.rag.mtrag.documents.{subset}", overwrite=True)
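Note on the conversation-building steps in the card above: ZipFieldValues pairs the per-turn roles and contents lists, and Dictify turns each pair into a role/content dict, so the question field becomes a Dialog. A stand-alone sketch of that reshaping, with invented values:

# Illustrative only: mimics ZipFieldValues + Dictify on made-up values.
roles = ["user", "agent", "user"]
contents = ["Hi", "Hello! How can I help?", "What is MTRAG?"]

# ZipFieldValues(fields=["roles", "contents"], to_field="conversation")
conversation = list(zip(roles, contents))

# Dictify(with_keys=["role", "content"], process_every_value=True)
question = [dict(zip(["role", "content"], turn)) for turn in conversation]
# question == [{"role": "user", "content": "Hi"}, ...]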
3 changes: 3 additions & 0 deletions prepare/metrics/rag.py
@@ -7,6 +7,7 @@
    TokenOverlap,
)
from unitxt.operators import Copy, ListFieldValues
from unitxt.serializers import MultiTypeSerializer
from unitxt.test_utils.metrics import test_metric

metrics = {
@@ -494,6 +495,7 @@
    "metrics.rag.end_to_end.answer_reward": [
        copy_field_prediction_answer_to_prediction,
        copy_field_question_to_references_in_a_list,
        MultiTypeSerializer(field="references", process_every_value=True),
    ],
    "metrics.rag.end_to_end.answer_faithfulness": [
        copy_field_prediction_contexts_to_references,
@@ -506,6 +508,7 @@
    "metrics.rag.end_to_end.context_relevance": [
        copy_field_prediction_contexts_to_references,
        copy_field_question_to_prediction,
        MultiTypeSerializer(field="prediction"),
    ],
}

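The MultiTypeSerializer steps added in this file (and in the files below) exist because references and predictions may now hold structured values such as a Dialog, while the underlying metrics expect strings. A rough sketch of the idea, labeled as an assumption about intent, not unitxt's actual implementation:

# Rough sketch (assumption: mirrors the serializer's intent, not its code).
# Strings pass through; a Dialog-like list of role/content turns is rendered
# to text; anything else falls back to str().
def serialize(value):
    if isinstance(value, str):
        return value
    if isinstance(value, list) and value and all(isinstance(t, dict) for t in value):
        return "\n".join(f"{t['role']}: {t['content']}" for t in value)
    return str(value)

print(serialize("a plain question"))  # unchanged
print(serialize([{"role": "user", "content": "What is RAG?"}]))  # "user: What is RAG?"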
3 changes: 3 additions & 0 deletions prepare/metrics/rag_answer_relevance.py
@@ -3,6 +3,7 @@
    MetricPipeline,
)
from unitxt.operators import Copy, ListFieldValues
from unitxt.serializers import MultiTypeSerializer

task_names = ["external_rag", "response_generation", "end_to_end"]
base = "metrics.rag"
@@ -30,6 +31,7 @@ def get_preprocess_steps(task):
                    "task_data/question": "references",
                }
            ),
            MultiTypeSerializer(field="references", process_every_value=True),
            last_step,
        ]
    if task == "end_to_end":
@@ -40,6 +42,7 @@ def get_preprocess_steps(task):
                    "prediction/answer": "prediction",
                }
            ),
            MultiTypeSerializer(field="references", process_every_value=True),
            last_step,
        ]
    raise ValueError(f"Unsupported rag task {task}")
3 changes: 3 additions & 0 deletions prepare/metrics/rag_context_relevance.py
@@ -3,6 +3,7 @@
    MetricPipeline,
)
from unitxt.operators import Copy
from unitxt.serializers import MultiTypeSerializer

base = "metrics.rag"
tasks = ["external_rag", "end_to_end"]
@@ -15,11 +16,13 @@ def get_preprocess_steps(task):
        return [
            Copy(field="contexts", to_field="references"),
            Copy(field="question", to_field="prediction"),
            MultiTypeSerializer(field="prediction"),
        ]
    if task == "end_to_end":
        return [
            Copy(field="prediction/contexts", to_field="references"),
            Copy(field="task_data/question", to_field="prediction"),
            MultiTypeSerializer(field="prediction"),
        ]
    raise ValueError(f"Unsupported rag task for {dimension}:{task}")

2 changes: 2 additions & 0 deletions prepare/metrics/rag_metrics_deprecated.py
@@ -2,6 +2,7 @@
from unitxt.collections_operators import Wrap
from unitxt.metrics import MetricPipeline
from unitxt.operators import Copy, ListFieldValues
from unitxt.serializers import MultiTypeSerializer

base = "metrics.rag"
new_base = "metrics.rag.external_rag"
@@ -100,6 +101,7 @@ def get_replacing_metric(depr_metric):
    # This metric compares the answer (as the prediction) to the question (as the reference).
    # We have to wrap the question in a list (otherwise it will be a string),
    # because references are expected to be lists
    MultiTypeSerializer(field="references"),
    ListFieldValues(fields=["references"], to_field="references"),
]
add_metric_pipeline_to_catalog(
5 changes: 3 additions & 2 deletions prepare/tasks/rag/rag_end_to_end.py
@@ -2,7 +2,7 @@

from unitxt import add_to_catalog
from unitxt.blocks import Task
from unitxt.types import RagResponse
from unitxt.types import Dialog, RagResponse

add_to_catalog(
    Task(
@@ -11,7 +11,7 @@
        For details of RAG see: https://www.unitxt.ai/en/latest/docs/rag_support.html.
        """,
        input_fields={
            "question": str,
            "question": Union[str, Dialog],
            "question_id": Any,
            "metadata_field": str,
        },
@@ -44,6 +44,7 @@
    overwrite=True,
)


add_to_catalog(
    Task(
        input_fields={
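With question widened from str to Union[str, Dialog], both of the instance shapes below are valid inputs to tasks.rag.end_to_end (values invented for illustration):

# Single-turn question: a plain string, as before.
question_a = "Who wrote Dune?"

# Multi-turn question: a Dialog, i.e. a list of role/content turns, as
# produced by the mtrag card's ZipFieldValues + Dictify preprocessing.
question_b = [
    {"role": "user", "content": "Who wrote Dune?"},
    {"role": "agent", "content": "Frank Herbert."},
    {"role": "user", "content": "When was it first published?"},
]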
11 changes: 8 additions & 3 deletions prepare/templates/rag/end_to_end.py
@@ -1,7 +1,7 @@
from unitxt import add_to_catalog
from unitxt.operator import SequentialOperator
from unitxt.struct_data_operators import LoadJson
from unitxt.templates import InputOutputTemplate
from unitxt.templates import JsonOutputTemplate

add_to_catalog(
    SequentialOperator(
@@ -18,9 +18,14 @@

add_to_catalog(
    # For rag end-to-end tasks
    InputOutputTemplate(
    JsonOutputTemplate(
        input_format="",
        output_format='{{"answer": "{reference_answers}", "contexts" : ["{reference_contexts}"], "context_ids" : ["{reference_context_ids}"]}}',
        output_fields={
            "reference_answers": "answer",
            "reference_contexts": "contexts",
            "reference_context_ids": "context_ids",
        },
        wrap_with_list_fields=["reference_contexts", "reference_context_ids"],
        postprocessors=["processors.load_json_predictions"],
    ),
    "templates.rag.end_to_end.json_predictions",
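The move from a hand-built format string to JsonOutputTemplate hands quoting and escaping to a real JSON encoder. Roughly, the rendered target for one instance should look like this (invented values; field renaming follows output_fields, and wrap_with_list_fields puts the two context fields inside lists):

import json

# Hypothetical rendered reference for one instance.
target = {
    "answer": "Paris",                           # from reference_answers
    "contexts": ["France's capital is Paris."],  # from reference_contexts
    "context_ids": ["clapnq-doc-42"],            # from reference_context_ids
}
print(json.dumps(target))
# {"answer": "Paris", "contexts": ["France's capital is Paris."], "context_ids": ["clapnq-doc-42"]}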
64 changes: 64 additions & 0 deletions src/unitxt/catalog/cards/rag/mtrag.json
@@ -0,0 +1,64 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_csv",
        "files": {
            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
        },
        "file_type": "json",
        "lines": true,
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "map_instance_values",
            "mappers": {
                "Answerability": {
                    "['UNANSWERABLE']": false,
                    "['ANSWERABLE']": true,
                    "['PARTIAL']": true
                }
            }
        },
        {
            "__type__": "copy",
            "field_to_field": {
                "targets/*/text": "reference_answers",
                "Answerability": "is_answerable_label",
                "task_id": "question_id",
                "contexts/*/document_id": "reference_context_ids",
                "contexts/*/text": "reference_contexts",
                "input/*/speaker": "roles",
                "input/*/text": "contents"
            }
        },
        {
            "__type__": "zip_field_values",
            "fields": [
                "roles",
                "contents"
            ],
            "to_field": "conversation"
        },
        {
            "__type__": "dictify",
            "field": "conversation",
            "with_keys": [
                "role",
                "content"
            ],
            "to_field": "question",
            "process_every_value": true
        }
    ],
    "task": "tasks.rag.end_to_end",
    "templates": {
        "default": "templates.rag.end_to_end.json_predictions"
    },
    "__tags__": {
        "license": "apache-2.0"
    },
    "__description__": "MTRAG: a comprehensive and diverse human-generated multi-turn RAG dataset, accompanied by four document corpora. To the best of our knowledge, MTRAG is the first end-to-end human-generated multi-turn RAG benchmark that reflects real-world properties of multi-turn conversations.\n"
}
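Once this catalog entry lands, the card is addressable by name. A minimal usage sketch, assuming the standard unitxt loading API (the exact signature may differ across versions):

from unitxt import load_dataset

# "cards.rag.mtrag" and the template name follow the add_to_catalog calls above.
dataset = load_dataset(
    card="cards.rag.mtrag",
    template="templates.rag.end_to_end.json_predictions",
    split="test",
)
print(dataset[0]["source"])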
43 changes: 43 additions & 0 deletions src/unitxt/catalog/cards/rag/mtrag/documents/clapnq.json
@@ -0,0 +1,43 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_csv",
        "files": {
            "test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/clapnq.jsonl.zip"
        },
        "compression": "zip",
        "file_type": "json",
        "lines": true,
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "cast",
            "field": "_id",
            "to": "str",
            "to_field": "document_id"
        },
        {
            "__type__": "wrap",
            "field": "text",
            "inside": "list",
            "to_field": "passages"
        },
        {
            "__type__": "set",
            "fields": {
                "metadata_field": ""
            }
        }
    ],
    "task": "tasks.rag.corpora",
    "templates": {
        "empty": {
            "__type__": "input_output_template",
            "input_format": "",
            "output_format": ""
        }
    }
}