metrics: add domain specific rubrics based scoring #1189

Merged
merged 23 commits on Aug 15, 2024

Changes from all commits
Commits (23)
173c167
Updated regex_based.py from iter-v3 branch
vaishakhRaveendran Jul 18, 2024
c8fb5ff
New update
vaishakhRaveendran Aug 9, 2024
e2da934
solve merge conflicts
vaishakhRaveendran Aug 9, 2024
7c98088
update to_pandas() methods
vaishakhRaveendran Aug 9, 2024
18c9c6c
Add to_dict() method to QAC class
vaishakhRaveendran Aug 9, 2024
40eeea5
Add prometheus-eval to metrics
vaishakhRaveendran Aug 12, 2024
6d47d22
Update base.py
vaishakhRaveendran Aug 12, 2024
e9f0240
roll back
vaishakhRaveendran Aug 12, 2024
02fa528
roll back
vaishakhRaveendran Aug 12, 2024
09dc1ef
roll back
vaishakhRaveendran Aug 12, 2024
8f8f150
improve and shorten prompt
shahules786 Aug 13, 2024
106c953
Removes the relative grading part and focuses on the absolute grading
vaishakhRaveendran Aug 14, 2024
e3cb141
Merge remote-tracking branch 'origin/metrics' into metrics
vaishakhRaveendran Aug 14, 2024
91335cb
Removes the relative grading part and focuses on the absolute grading
vaishakhRaveendran Aug 14, 2024
7fe2b17
update the _init_.py to add PrometheusAbsolute
vaishakhRaveendran Aug 14, 2024
4056858
remove-rename and bug fixes
shahules786 Aug 14, 2024
64c70e0
add rubrics to init
shahules786 Aug 14, 2024
27d194d
remove format changes
shahules786 Aug 14, 2024
bb4b54d
Merge branch 'main' into pr/vaishakhRaveendran/1189-1
shahules786 Aug 14, 2024
16950d0
Merge branch 'main' into metrics
shahules786 Aug 14, 2024
98b2a56
change metric to domain specific eval
shahules786 Aug 14, 2024
81b5f89
add required imports
shahules786 Aug 14, 2024
5114296
add default rubrics
shahules786 Aug 14, 2024
10 changes: 10 additions & 0 deletions src/ragas/metrics/__init__.py
@@ -13,6 +13,12 @@
)
from ragas.metrics._context_recall import ContextRecall, context_recall
from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness
from ragas.metrics._rubrics_based import (
    LabelledRubricsScore,
    ReferenceFreeRubricsScore,
    labelled_rubrics_score,
    reference_free_rubrics_score,
)
from ragas.metrics._summarization import SummarizationScore, summarization_score
from ragas.metrics.critique import AspectCritique

@@ -37,4 +43,8 @@
    "context_entity_recall",
    "SummarizationScore",
    "summarization_score",
    "labelled_rubrics_score",
    "reference_free_rubrics_score",
    "ReferenceFreeRubricsScore",
    "LabelledRubricsScore",
]
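
As a quick illustration of the new exports, here is a minimal sketch (not part of the diff) of importing the rubric metrics and supplying custom, domain-specific rubrics; the medical rubric text below is purely illustrative and is not shipped with the library.

# Minimal sketch (not part of this PR): import the newly exported metrics
# and pass a custom, domain-specific rubric. The rubric text is illustrative.
from ragas.metrics import LabelledRubricsScore, ReferenceFreeRubricsScore

medical_rubrics = {
    "score1_description": "The answer contradicts established clinical guidance.",
    "score2_description": "The answer is mostly incorrect or omits key clinical facts.",
    "score3_description": "The answer is broadly correct but vague or incomplete.",
    "score4_description": "The answer is correct with only minor omissions.",
    "score5_description": "The answer is fully correct, specific, and clinically precise.",
}

domain_metric = LabelledRubricsScore(rubrics=medical_rubrics)
reference_free_metric = ReferenceFreeRubricsScore()

Since both metrics are dataclasses with a rubrics field, swapping in a domain rubric only requires passing a new dictionary.
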
188 changes: 188 additions & 0 deletions src/ragas/metrics/_rubrics_based.py
@@ -0,0 +1,188 @@
from __future__ import annotations

import typing as t
from dataclasses import dataclass, field

import numpy as np
from langchain_core.pydantic_v1 import BaseModel, Field

from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions
from ragas.llms.prompt import Prompt
from ragas.metrics.base import EvaluationMode, MetricWithLLM

if t.TYPE_CHECKING:
    from langchain_core.callbacks import Callbacks

    from ragas.llms.prompt import PromptValue


class ScoreFeedback(BaseModel):
    feedback: str = Field(..., description="The feedback for the response")
    score: int = Field(..., description="The score given to the response")


class ScoreFeedbackAnswers(BaseModel):
    __root__: t.List[ScoreFeedback]

    def dicts(self) -> t.List[t.Dict]:
        return self.dict()["__root__"]


_score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers)
_score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers)

DEFAULT_REFERENCE_FREE_RUBRICS = {
    "score1_description": "The response is incorrect or does not answer the question.",
    "score2_description": "The response is partially correct but may include errors or incomplete information.",
    "score3_description": "The response is generally correct but lacks clarity or completeness.",
    "score4_description": "The response is correct and clear, with minor issues or missing details.",
    "score5_description": "The response is completely accurate, clear, and answers the question directly.",
}


DEFAULT_WITH_REFERENCE_RUBRICS = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}


WITH_REFERENCE_SCORING_PROMPT = Prompt(
    name="prometheus_score",
    output_format_instruction=_score_feedback_output_instructions,
    instruction="""Given a question (which might contain an input along with it), an answer to evaluate, a ground_truth answer that gets a score of 5, and a score rubric representing the evaluation criteria:
1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, without evaluating in general.
2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""",
    examples=[
        {
            "question": "What is the capital of France?",
            "answer": "The capital of France is Paris.",
            "ground_truth": "The capital of France is Paris.",
            "rubrics": DEFAULT_WITH_REFERENCE_RUBRICS,
            "analysis": ScoreFeedbackAnswers.parse_obj(
                [
                    {
                        "feedback": """The response is completely accurate and directly answers the question about the capital of France. It matches the reference answer perfectly and does not contain any errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""",
                        "score": 5,
                    }
                ]
            ).dicts(),
        }
    ],
    input_keys=["question", "answer", "ground_truth", "rubrics"],
    output_key="analysis",
    language="english",
)


WITHOUT_REFERENCE_SCORING_PROMPT = Prompt(
    name="prometheus_score",
    output_format_instruction=_score_feedback_output_instructions,
    instruction="""Given a question (which might contain an input along with it), an answer to evaluate, and a score rubric representing the evaluation criteria:
1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, without evaluating in general.
2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""",
    examples=[
        {
            "question": "What is the capital of France?",
            "answer": "The capital of France is Paris.",
            "rubrics": DEFAULT_REFERENCE_FREE_RUBRICS,
            "analysis": ScoreFeedbackAnswers.parse_obj(
                [
                    {
                        "feedback": """The response is completely accurate and directly answers the question about the capital of France. It contains no errors or omissions and is clearly stated. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""",
                        "score": 5,
                    }
                ]
            ).dicts(),
        }
    ],
    input_keys=["question", "answer", "rubrics"],
    output_key="analysis",
    language="english",
)


@dataclass
class LabelledRubricsScore(MetricWithLLM):
    name: str = "labelled_rubrics_score"  # type: ignore
    evaluation_mode: EvaluationMode = EvaluationMode.qcg  # type: ignore
    rubrics: t.Dict[str, str] = field(
        default_factory=lambda: DEFAULT_WITH_REFERENCE_RUBRICS
    )
    scoring_prompt: Prompt = field(
        default_factory=lambda: WITH_REFERENCE_SCORING_PROMPT
    )
    max_retries: int = 1

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        assert self.llm is not None, "LLM is not set"

        prompt_value = self._create_prompt(row)

        response = await self.llm.generate(prompt_value, callbacks=callbacks)

        parsed_response = await _score_feedback_output_parser.aparse(
            response.generations[0][0].text, prompt_value, self.llm, self.max_retries
        )

        if parsed_response is None:
            return np.nan

        score = parsed_response.dicts()[0]["score"]
        return score

    def _create_prompt(self, row: t.Dict) -> PromptValue:
        question, contexts, answer, ground_truth = (
            row["question"],
            row["contexts"],
            row["answer"],
            row["ground_truth"],
        )
        contexts = "\n".join(contexts)
        question = f"{question} answer using context: {contexts}"
        return self.scoring_prompt.format(
            question=question,
            answer=answer,
            ground_truth=ground_truth,
            rubrics=self.rubrics,
        )

    def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None:
        assert self.llm is not None, "LLM must be set to adapt the metric"
        self.scoring_prompt.adapt(language, self.llm, cache_dir)

    def save(self, cache_dir: t.Optional[str] = None) -> None:
        self.scoring_prompt.save(cache_dir)


@dataclass
class ReferenceFreeRubricsScore(LabelledRubricsScore):
    name: str = "reference_free_rubrics_score"  # type: ignore
    evaluation_mode: EvaluationMode = EvaluationMode.qga  # type: ignore
    rubrics: t.Dict[str, str] = field(
        default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS
    )
    scoring_prompt: Prompt = field(
        default_factory=lambda: WITHOUT_REFERENCE_SCORING_PROMPT
    )
    max_retries: int = 1

    def _create_prompt(self, row: t.Dict) -> PromptValue:
        question, contexts, answer = (
            row["question"],
            row["contexts"],
            row["answer"],
        )
        contexts = "\n".join(contexts)
        question = f"{question} answer using context: {contexts}"
        return self.scoring_prompt.format(
            question=question,
            answer=answer,
            rubrics=self.rubrics,
        )


labelled_rubrics_score = LabelledRubricsScore()
reference_free_rubrics_score = ReferenceFreeRubricsScore()
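
For completeness, a hedged usage sketch (not part of the diff) of running the new metrics through the standard ragas evaluate() entry point; the dataset columns mirror the fields consumed by _create_prompt above (question, contexts, answer, ground_truth), and the printed scores are illustrative.

# Hedged usage sketch (not part of this PR), assuming ragas.evaluate() and a
# datasets.Dataset with the columns read by _create_prompt:
# question, contexts, answer, ground_truth.
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import labelled_rubrics_score, reference_free_rubrics_score

data = {
    "question": ["What is the capital of France?"],
    "contexts": [["Paris is the capital and largest city of France."]],
    "answer": ["The capital of France is Paris."],
    "ground_truth": ["The capital of France is Paris."],
}

result = evaluate(
    Dataset.from_dict(data),
    metrics=[labelled_rubrics_score, reference_free_rubrics_score],
)
print(result)  # e.g. {'labelled_rubrics_score': 5.0, 'reference_free_rubrics_score': 5.0}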