Commit

Add mtrag benchmark (#1548)
* Add mtrag benchmark

Signed-off-by: elronbandel <[email protected]>

* Add multi_type_serializer for references and prediction fields in various JSON metrics

Signed-off-by: elronbandel <[email protected]>

* Remove unused TempOperator class and delete obsolete multi_turn.json task file

Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: elronbandel <[email protected]>
elronbandel authored and tejaswini-nexplore committed Jan 24, 2025
1 parent 7421e72 commit 3a0d99a
Showing 35 changed files with 534 additions and 18 deletions.
131 changes: 131 additions & 0 deletions prepare/cards/mtrag.py
@@ -0,0 +1,131 @@
import json

from unitxt import add_to_catalog
from unitxt.blocks import (
    TaskCard,
)
from unitxt.collections_operators import Dictify, Wrap
from unitxt.loaders import LoadCSV
from unitxt.operators import (
    Cast,
    Copy,
    MapInstanceValues,
    Set,
    ZipFieldValues,
)
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadCSV(
        files={
            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
        },
        file_type="json",
        lines=True,
        data_classification_policy=["public"],
    ),
    preprocess_steps=[
        MapInstanceValues(
            {
                "Answerability": {
                    "['UNANSWERABLE']": False,
                    "['ANSWERABLE']": True,
                    "['PARTIAL']": True,
                },
            }
        ),
        Copy(
            field_to_field={
                "targets/*/text": "reference_answers",
                "Answerability": "is_answerable_label",
                "task_id": "question_id",
                "contexts/*/document_id": "reference_context_ids",
                "contexts/*/text": "reference_contexts",
                "input/*/speaker": "roles",
                "input/*/text": "contents",
            },
        ),
        ZipFieldValues(
            fields=["roles", "contents"],
            to_field="conversation",
        ),
        Dictify(
            field="conversation",
            with_keys=["role", "content"],
            to_field="question",
            process_every_value=True,
        ),
    ],
    task="tasks.rag.end_to_end",
    templates={"default": "templates.rag.end_to_end.json_predictions"},
    __tags__={"license": "apache-2.0"},
    __description__="""MTRAG: a comprehensive and diverse human-generated multi-turn RAG dataset, accompanied by four document corpora. To the best of our knowledge, MTRAG is the first end-to-end human-generated multi-turn RAG benchmark that reflects real-world properties of multi-turn conversations.
""",
)
wrong_answer = {
    "contexts": ["hi"],
    "is_answerable": True,
    "answer": "Don't know",
    "context_ids": ["id0"],
}

test_card(
    card,
    strict=False,
    full_mismatch_prediction_values=[json.dumps(wrong_answer)],
    debug=False,
    demos_taken_from="test",
    demos_pool_size=5,
)

add_to_catalog(card, "cards.rag.mtrag", overwrite=True)


for subset in ["clapnq", "cloud", "fiqa", "govt"]:
    subset_operators = []
    if subset in ["fiqa", "clapnq"]:
        subset_operators.append(
            Cast(
                field="_id",
                to="str",
                to_field="document_id",
            )
        )
    if subset in ["cloud"]:
        subset_operators.append(Set(fields={"title": ""}))

    card = TaskCard(
        loader=LoadCSV(
            files={
                "test": f"https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/{subset}.jsonl.zip"
            },
            compression="zip",
            file_type="json",
            lines=True,
            data_classification_policy=["public"],
        ),
        preprocess_steps=[
            *subset_operators,
            Wrap(field="text", inside="list", to_field="passages"),
            Set(
                fields={
                    "metadata_field": "",
                }
            ),
        ],
        task="tasks.rag.corpora",
        templates={
            "empty": InputOutputTemplate(
                input_format="",
                output_format="",
            ),
        },
    )
    test_card(
        card,
        strict=False,
        demos_taken_from="test",
    )

    add_to_catalog(card, f"cards.rag.mtrag.documents.{subset}", overwrite=True)
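Note on the conversation-building steps in the card above: ZipFieldValues pairs the per-turn roles and contents lists, and Dictify turns each pair into a role/content dict, so the question field becomes a Dialog. A stand-alone sketch of that reshaping, with invented values:

# Illustrative only: mimics ZipFieldValues + Dictify on made-up values.
roles = ["user", "agent", "user"]
contents = ["Hi", "Hello! How can I help?", "What is MTRAG?"]

# ZipFieldValues(fields=["roles", "contents"], to_field="conversation")
conversation = list(zip(roles, contents))

# Dictify(with_keys=["role", "content"], process_every_value=True)
question = [dict(zip(["role", "content"], turn)) for turn in conversation]
# question == [{"role": "user", "content": "Hi"}, ...]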
3 changes: 3 additions & 0 deletions prepare/metrics/rag.py
@@ -7,6 +7,7 @@
    TokenOverlap,
)
from unitxt.operators import Copy, ListFieldValues
from unitxt.serializers import MultiTypeSerializer
from unitxt.test_utils.metrics import test_metric

metrics = {
@@ -494,6 +495,7 @@
    "metrics.rag.end_to_end.answer_reward": [
        copy_field_prediction_answer_to_prediction,
        copy_field_question_to_references_in_a_list,
        MultiTypeSerializer(field="references", process_every_value=True),
    ],
    "metrics.rag.end_to_end.answer_faithfulness": [
        copy_field_prediction_contexts_to_references,
@@ -506,6 +508,7 @@
    "metrics.rag.end_to_end.context_relevance": [
        copy_field_prediction_contexts_to_references,
        copy_field_question_to_prediction,
        MultiTypeSerializer(field="prediction"),
    ],
}

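The MultiTypeSerializer steps added in this file (and in the files below) exist because references and predictions may now hold structured values such as a Dialog, while the underlying metrics expect strings. A rough sketch of the idea, labeled as an assumption about intent, not unitxt's actual implementation:

# Rough sketch (assumption: mirrors the serializer's intent, not its code).
# Strings pass through; a Dialog-like list of role/content turns is rendered
# to text; anything else falls back to str().
def serialize(value):
    if isinstance(value, str):
        return value
    if isinstance(value, list) and value and all(isinstance(t, dict) for t in value):
        return "\n".join(f"{t['role']}: {t['content']}" for t in value)
    return str(value)

print(serialize("a plain question"))  # unchanged
print(serialize([{"role": "user", "content": "What is RAG?"}]))  # "user: What is RAG?"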
3 changes: 3 additions & 0 deletions prepare/metrics/rag_answer_relevance.py
@@ -3,6 +3,7 @@
    MetricPipeline,
)
from unitxt.operators import Copy, ListFieldValues
from unitxt.serializers import MultiTypeSerializer

task_names = ["external_rag", "response_generation", "end_to_end"]
base = "metrics.rag"
@@ -30,6 +31,7 @@ def get_preprocess_steps(task):
                    "task_data/question": "references",
                }
            ),
            MultiTypeSerializer(field="references", process_every_value=True),
            last_step,
        ]
    if task == "end_to_end":
@@ -40,6 +42,7 @@ def get_preprocess_steps(task):
                    "prediction/answer": "prediction",
                }
            ),
            MultiTypeSerializer(field="references", process_every_value=True),
            last_step,
        ]
    raise ValueError(f"Unsupported rag task {task}")
3 changes: 3 additions & 0 deletions prepare/metrics/rag_context_relevance.py
@@ -3,6 +3,7 @@
    MetricPipeline,
)
from unitxt.operators import Copy
from unitxt.serializers import MultiTypeSerializer

base = "metrics.rag"
tasks = ["external_rag", "end_to_end"]
@@ -15,11 +16,13 @@ def get_preprocess_steps(task):
        return [
            Copy(field="contexts", to_field="references"),
            Copy(field="question", to_field="prediction"),
            MultiTypeSerializer(field="prediction"),
        ]
    if task == "end_to_end":
        return [
            Copy(field="prediction/contexts", to_field="references"),
            Copy(field="task_data/question", to_field="prediction"),
            MultiTypeSerializer(field="prediction"),
        ]
    raise ValueError(f"Unsupported rag task for {dimension}:{task}")

2 changes: 2 additions & 0 deletions prepare/metrics/rag_metrics_deprecated.py
@@ -2,6 +2,7 @@
from unitxt.collections_operators import Wrap
from unitxt.metrics import MetricPipeline
from unitxt.operators import Copy, ListFieldValues
from unitxt.serializers import MultiTypeSerializer

base = "metrics.rag"
new_base = "metrics.rag.external_rag"
@@ -100,6 +101,7 @@ def get_replacing_metric(depr_metric):
    # This metric compares the answer (as the prediction) to the question (as the reference).
    # We have to wrap the question in a list (otherwise it will be a string),
    # because references are expected to be lists
    MultiTypeSerializer(field="references"),
    ListFieldValues(fields=["references"], to_field="references"),
]
add_metric_pipeline_to_catalog(
5 changes: 3 additions & 2 deletions prepare/tasks/rag/rag_end_to_end.py
@@ -2,7 +2,7 @@

from unitxt import add_to_catalog
from unitxt.blocks import Task
from unitxt.types import RagResponse
from unitxt.types import Dialog, RagResponse

add_to_catalog(
    Task(
@@ -11,7 +11,7 @@
        For details of RAG see: https://www.unitxt.ai/en/latest/docs/rag_support.html.
        """,
        input_fields={
            "question": str,
            "question": Union[str, Dialog],
            "question_id": Any,
            "metadata_field": str,
        },
@@ -44,6 +44,7 @@
    overwrite=True,
)


add_to_catalog(
    Task(
        input_fields={
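With question widened from str to Union[str, Dialog], both of the instance shapes below are valid inputs to tasks.rag.end_to_end (values invented for illustration):

# Single-turn question: a plain string, as before.
question_a = "Who wrote Dune?"

# Multi-turn question: a Dialog, i.e. a list of role/content turns, as
# produced by the mtrag card's ZipFieldValues + Dictify preprocessing.
question_b = [
    {"role": "user", "content": "Who wrote Dune?"},
    {"role": "agent", "content": "Frank Herbert."},
    {"role": "user", "content": "When was it first published?"},
]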
11 changes: 8 additions & 3 deletions prepare/templates/rag/end_to_end.py
@@ -1,7 +1,7 @@
from unitxt import add_to_catalog
from unitxt.operator import SequentialOperator
from unitxt.struct_data_operators import LoadJson
from unitxt.templates import InputOutputTemplate
from unitxt.templates import JsonOutputTemplate

add_to_catalog(
    SequentialOperator(
@@ -18,9 +18,14 @@

add_to_catalog(
    # For rag end-to-end tasks
    InputOutputTemplate(
    JsonOutputTemplate(
        input_format="",
        output_format='{{"answer": "{reference_answers}", "contexts" : ["{reference_contexts}"], "context_ids" : ["{reference_context_ids}"]}}',
        output_fields={
            "reference_answers": "answer",
            "reference_contexts": "contexts",
            "reference_context_ids": "context_ids",
        },
        wrap_with_list_fields=["reference_contexts", "reference_context_ids"],
        postprocessors=["processors.load_json_predictions"],
    ),
    "templates.rag.end_to_end.json_predictions",
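The move from a hand-built format string to JsonOutputTemplate hands quoting and escaping to a real JSON encoder. Roughly, the rendered target for one instance should look like this (invented values; field renaming follows output_fields, and wrap_with_list_fields puts the two context fields inside lists):

import json

# Hypothetical rendered reference for one instance.
target = {
    "answer": "Paris",                           # from reference_answers
    "contexts": ["France's capital is Paris."],  # from reference_contexts
    "context_ids": ["clapnq-doc-42"],            # from reference_context_ids
}
print(json.dumps(target))
# {"answer": "Paris", "contexts": ["France's capital is Paris."], "context_ids": ["clapnq-doc-42"]}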
64 changes: 64 additions & 0 deletions src/unitxt/catalog/cards/rag/mtrag.json
@@ -0,0 +1,64 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_csv",
        "files": {
            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
        },
        "file_type": "json",
        "lines": true,
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "map_instance_values",
            "mappers": {
                "Answerability": {
                    "['UNANSWERABLE']": false,
                    "['ANSWERABLE']": true,
                    "['PARTIAL']": true
                }
            }
        },
        {
            "__type__": "copy",
            "field_to_field": {
                "targets/*/text": "reference_answers",
                "Answerability": "is_answerable_label",
                "task_id": "question_id",
                "contexts/*/document_id": "reference_context_ids",
                "contexts/*/text": "reference_contexts",
                "input/*/speaker": "roles",
                "input/*/text": "contents"
            }
        },
        {
            "__type__": "zip_field_values",
            "fields": [
                "roles",
                "contents"
            ],
            "to_field": "conversation"
        },
        {
            "__type__": "dictify",
            "field": "conversation",
            "with_keys": [
                "role",
                "content"
            ],
            "to_field": "question",
            "process_every_value": true
        }
    ],
    "task": "tasks.rag.end_to_end",
    "templates": {
        "default": "templates.rag.end_to_end.json_predictions"
    },
    "__tags__": {
        "license": "apache-2.0"
    },
    "__description__": "MTRAG: a comprehensive and diverse human-generated multi-turn RAG dataset, accompanied by four document corpora. To the best of our knowledge, MTRAG is the first end-to-end human-generated multi-turn RAG benchmark that reflects real-world properties of multi-turn conversations.\n"
}
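Once this catalog entry lands, the card is addressable by name. A minimal usage sketch, assuming the standard unitxt loading API (the exact signature may differ across versions):

from unitxt import load_dataset

# "cards.rag.mtrag" and the template name follow the add_to_catalog calls above.
dataset = load_dataset(
    card="cards.rag.mtrag",
    template="templates.rag.end_to_end.json_predictions",
    split="test",
)
print(dataset[0]["source"])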
43 changes: 43 additions & 0 deletions src/unitxt/catalog/cards/rag/mtrag/documents/clapnq.json
@@ -0,0 +1,43 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_csv",
        "files": {
            "test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/clapnq.jsonl.zip"
        },
        "compression": "zip",
        "file_type": "json",
        "lines": true,
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "cast",
            "field": "_id",
            "to": "str",
            "to_field": "document_id"
        },
        {
            "__type__": "wrap",
            "field": "text",
            "inside": "list",
            "to_field": "passages"
        },
        {
            "__type__": "set",
            "fields": {
                "metadata_field": ""
            }
        }
    ],
    "task": "tasks.rag.corpora",
    "templates": {
        "empty": {
            "__type__": "input_output_template",
            "input_format": "",
            "output_format": ""
        }
    }
}