From 4e5c7de6d8208f055bdb5693c91e1c54f593ad23 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:10:41 +0530 Subject: [PATCH 1/9] remve BS --- src/ragas/metrics/_context_entities_recall.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ragas/metrics/_context_entities_recall.py b/src/ragas/metrics/_context_entities_recall.py index a17b75494..c76000bf3 100644 --- a/src/ragas/metrics/_context_entities_recall.py +++ b/src/ragas/metrics/_context_entities_recall.py @@ -139,7 +139,6 @@ class ContextEntityRecall(MetricWithLLM, SingleTurnMetric): context_entity_recall_prompt: Prompt = field( default_factory=lambda: TEXT_ENTITY_EXTRACTION ) - batch_size: int = 15 max_retries: int = 1 def _compute_score( @@ -195,4 +194,4 @@ def save(self, cache_dir: str | None = None) -> None: return self.context_entity_recall_prompt.save(cache_dir) -context_entity_recall = ContextEntityRecall(batch_size=15) +context_entity_recall = ContextEntityRecall() From 9160daa88405e7692080316ee8a7d1f06e160e7a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:10:52 +0530 Subject: [PATCH 2/9] rename metric --- src/ragas/metrics/_answer_relevance.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 3a66372e1..bf13db578 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -25,15 +25,15 @@ from ragas.llms.prompt import PromptValue -class AnswerRelevanceClassification(BaseModel): +class ResponseRelevanceClassification(BaseModel): question: str noncommittal: int _output_instructions = get_json_format_instructions( - pydantic_object=AnswerRelevanceClassification + pydantic_object=ResponseRelevanceClassification ) -_output_parser = RagasoutputParser(pydantic_object=AnswerRelevanceClassification) +_output_parser = RagasoutputParser(pydantic_object=ResponseRelevanceClassification) QUESTION_GEN = Prompt( @@ -44,7 +44,7 @@ class AnswerRelevanceClassification(BaseModel): { "answer": """Albert Einstein was born in Germany.""", "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time""", - "output": AnswerRelevanceClassification.parse_obj( + "output": ResponseRelevanceClassification.parse_obj( { "question": "Where was Albert Einstein born?", "noncommittal": 0, @@ -54,7 +54,7 @@ class AnswerRelevanceClassification(BaseModel): { "answer": """It can change its skin color based on the temperature of its environment.""", "context": """A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.""", - "output": AnswerRelevanceClassification.parse_obj( + "output": ResponseRelevanceClassification.parse_obj( { "question": "What unique ability does the newly discovered species of frog have?", "noncommittal": 0, @@ -64,7 +64,7 @@ class AnswerRelevanceClassification(BaseModel): { "answer": """Everest""", "context": """The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.""", - "output": AnswerRelevanceClassification.parse_obj( + "output": ResponseRelevanceClassification.parse_obj( { "question": "What is the tallest mountain on Earth?", "noncommittal": 0, @@ -74,7 +74,7 @@ class AnswerRelevanceClassification(BaseModel): { "answer": """I don't know about 
the groundbreaking feature of the smartphone invented in 2023 as am unaware of information beyond 2022. """, "context": """In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.""", - "output": AnswerRelevanceClassification.parse_obj( + "output": ResponseRelevanceClassification.parse_obj( { "question": "What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal": 1, @@ -89,7 +89,7 @@ class AnswerRelevanceClassification(BaseModel): @dataclass -class AnswerRelevancy(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric): +class ResponseRelevancy(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric): """ Scores the relevancy of the answer according to the given question. Answers with incomplete, redundant or unnecessary information is penalized. @@ -139,7 +139,7 @@ def calculate_similarity( ) def _calculate_score( - self, answers: t.Sequence[AnswerRelevanceClassification], row: t.Dict + self, answers: t.Sequence[ResponseRelevanceClassification], row: t.Dict ) -> float: question = row["user_input"] gen_questions = [answer.question for answer in answers] @@ -197,4 +197,9 @@ def save(self, cache_dir: str | None = None) -> None: self.question_generation.save(cache_dir) +class AnswerRelevancy(ResponseRelevancy): + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + return await super()._ascore(row, callbacks) + + answer_relevancy = AnswerRelevancy() From 4f63b289c015f3f4a2e635a2abcfa4d6df383600 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:11:02 +0530 Subject: [PATCH 3/9] rename metric --- src/ragas/metrics/_answer_similarity.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 027e35a0a..b3e65736b 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -23,7 +23,7 @@ @dataclass -class AnswerSimilarity(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric): +class SemanticSimilarity(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric): """ Scores the semantic similarity of ground truth with generated answer. cross encoder score is used to quantify semantic similarity. 
@@ -91,4 +91,9 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: return score.tolist()[0] +class AnswerSimilarity(SemanticSimilarity): + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + return await super()._ascore(row, callbacks) + + answer_similarity = AnswerSimilarity() From 41772dbf44b35578a91b3284631b23712905a3d9 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:11:21 +0530 Subject: [PATCH 4/9] remove instances --- src/ragas/metrics/__init__.py | 14 +------------- src/ragas/metrics/_domain_specific_rubrics.py | 4 ---- src/ragas/metrics/_factual_correctness.py | 2 +- src/ragas/metrics/_noise_sensitivity.py | 8 +------- 4 files changed, 3 insertions(+), 25 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 6b476675d..244c055b2 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -13,21 +13,14 @@ ContextPrecision, ContextUtilization, context_precision, - context_utilization, ) from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._domain_specific_rubrics import ( RubricsScoreWithoutReference, RubricsScoreWithReference, - rubrics_score_with_reference, - rubrics_score_without_reference, ) from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness -from ragas.metrics._noise_sensitivity import ( - NoiseSensitivity, - noise_sensitivity_irrelevant, - noise_sensitivity_relevant, -) +from ragas.metrics._noise_sensitivity import NoiseSensitivity from ragas.metrics._summarization import SummarizationScore, summarization_score __all__ = [ @@ -41,7 +34,6 @@ "ContextPrecision", "context_precision", "ContextUtilization", - "context_utilization", "ContextRecall", "context_recall", "AspectCritic", @@ -52,10 +44,6 @@ "SummarizationScore", "summarization_score", "NoiseSensitivity", - "noise_sensitivity_irrelevant", - "noise_sensitivity_relevant", - "rubrics_score_with_reference", - "rubrics_score_without_reference", "RubricsScoreWithoutReference", "RubricsScoreWithReference", ] diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py index 09509132a..959306b36 100644 --- a/src/ragas/metrics/_domain_specific_rubrics.py +++ b/src/ragas/metrics/_domain_specific_rubrics.py @@ -307,7 +307,3 @@ def _create_single_turn_prompt(self, row: t.Dict) -> SingleTurnWithReferenceInpu reference=ground_truth, rubrics=self.rubrics, ) - - -rubrics_score_with_reference = RubricsScoreWithReference() -rubrics_score_without_reference = RubricsScoreWithoutReference() diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py index 15f574a94..42fabaa0e 100644 --- a/src/ragas/metrics/_factual_correctness.py +++ b/src/ragas/metrics/_factual_correctness.py @@ -9,11 +9,11 @@ from numpy.typing import NDArray from pydantic import BaseModel, Field -from ragas.experimental.prompt import PydanticPrompt from ragas.experimental.metrics._faithfulness import ( NLIStatementInput, NLIStatementPrompt, ) +from ragas.experimental.prompt import PydanticPrompt from ragas.metrics.base import ( MetricType, MetricWithLLM, diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index 122ddf03f..6633d9610 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -38,7 +38,7 @@ @dataclass class NoiseSensitivity(MetricWithLLM, SingleTurnMetric): name: str = 
"noise_sensitivity" # type: ignore - focus: str = "relevant" + focus: t.Literal["relevant", "irrelevant"] = "relevant" _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { @@ -266,8 +266,6 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: assert self.llm is not None, "LLM is not set" - logger.info(f"Adapting Faithfulness metric to {language}") - self.nli_statements_message = self.nli_statements_message.adapt( language, self.llm, cache_dir ) @@ -280,7 +278,3 @@ def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: def save(self, cache_dir: t.Optional[str] = None) -> None: self.nli_statements_message.save(cache_dir) self.statement_prompt.save(cache_dir) - - -noise_sensitivity_relevant = NoiseSensitivity() -noise_sensitivity_irrelevant = NoiseSensitivity(focus="irrelevant") From 1401ae37e2037eac91381f4068b5497302514a3c Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:26:41 +0530 Subject: [PATCH 5/9] remove and rename --- src/ragas/metrics/_rogue_score.py | 3 --- src/ragas/metrics/_sql_semantic_equivalence.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/ragas/metrics/_rogue_score.py b/src/ragas/metrics/_rogue_score.py index 78fd60548..9786b76ad 100644 --- a/src/ragas/metrics/_rogue_score.py +++ b/src/ragas/metrics/_rogue_score.py @@ -32,6 +32,3 @@ async def _single_turn_ascore( async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: return await self._single_turn_ascore(SingleTurnSample(**row), callbacks) - - -rouge_score = RougeScore() diff --git a/src/ragas/metrics/_sql_semantic_equivalence.py b/src/ragas/metrics/_sql_semantic_equivalence.py index 821c3f96e..63cc8e834 100644 --- a/src/ragas/metrics/_sql_semantic_equivalence.py +++ b/src/ragas/metrics/_sql_semantic_equivalence.py @@ -63,7 +63,7 @@ class EquivalencePrompt(PydanticPrompt[EquivalenceInput, EquivalenceOutput]): @dataclass -class LLMSqlEquivalenceWithReference(MetricWithLLM, SingleTurnMetric): +class LLMSQLEquivalence(MetricWithLLM, SingleTurnMetric): name: str = "llm_sql_equivalence_with_reference" # type: ignore _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { From 85720871caeb296b61b577c5b7895ebf4a192f0f Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:26:49 +0530 Subject: [PATCH 6/9] add metrics to init --- src/ragas/metrics/__init__.py | 56 +++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 244c055b2..ebb8f9fb4 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -2,9 +2,18 @@ import sys from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness -from ragas.metrics._answer_relevance import AnswerRelevancy, answer_relevancy -from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity +from ragas.metrics._answer_relevance import ( + AnswerRelevancy, + ResponseRelevancy, + answer_relevancy, +) +from ragas.metrics._answer_similarity import ( + AnswerSimilarity, + SemanticSimilarity, + answer_similarity, +) from ragas.metrics._aspect_critic import AspectCritic +from ragas.metrics._bleu_score import BleuScore from ragas.metrics._context_entities_recall import ( ContextEntityRecall, context_entity_recall, @@ -12,16 +21,37 @@ from 
ragas.metrics._context_precision import ( ContextPrecision, ContextUtilization, + LLMContextPrecisionWithoutReference, + NonLLMContextPrecisionWithReference, context_precision, ) -from ragas.metrics._context_recall import ContextRecall, context_recall +from ragas.metrics._context_recall import ( + ContextRecall, + LLMContextRecall, + NonLLMContextRecall, + context_recall, +) +from ragas.metrics._datacompy_score import DataCompyScore from ragas.metrics._domain_specific_rubrics import ( RubricsScoreWithoutReference, RubricsScoreWithReference, ) +from ragas.metrics._factual_correctness import FactualCorrectness from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness +from ragas.metrics._goal_accuracy import ( + AgentGoalAccuracyWithoutReference, + AgentGoalAccuracyWithReference, +) +from ragas.metrics._instance_specific_rubrics import ( + InstanceRubricsScoreWithoutReference, + InstanceRubricsWithReference, +) from ragas.metrics._noise_sensitivity import NoiseSensitivity +from ragas.metrics._rogue_score import RougeScore +from ragas.metrics._sql_semantic_equivalence import LLMSQLEquivalence +from ragas.metrics._string import ExactMatch, NonLLMStringSimilarity, StringPresence from ragas.metrics._summarization import SummarizationScore, summarization_score +from ragas.metrics._tool_call_accuracy import ToolCallAccuracy __all__ = [ "AnswerCorrectness", @@ -46,6 +76,26 @@ "NoiseSensitivity", "RubricsScoreWithoutReference", "RubricsScoreWithReference", + "LLMContextPrecisionWithoutReference", + "NonLLMContextPrecisionWithReference", + "LLMContextPrecisionWithoutReference", + "LLMContextRecall", + "NonLLMContextRecall", + "FactualCorrectness", + "InstanceRubricsScoreWithoutReference", + "InstanceRubricsWithReference", + "NonLLMStringSimilarity", + "ExactMatch", + "StringPresence", + "BleuScore", + "RougeScore", + "DataCompyScore", + "LLMSQLEquivalence", + "AgentGoalAccuracyWithoutReference", + "AgentGoalAccuracyWithReference", + "ToolCallAccuracy", + "ResponseRelevancy", + "SemanticSimilarity", ] current_module = sys.modules[__name__] From ab428a4ef5360e4c0dc273a0cf12945e606feba8 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:55:53 +0530 Subject: [PATCH 7/9] optional import --- src/ragas/metrics/_rogue_score.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ragas/metrics/_rogue_score.py b/src/ragas/metrics/_rogue_score.py index 9786b76ad..1a8a72fa8 100644 --- a/src/ragas/metrics/_rogue_score.py +++ b/src/ragas/metrics/_rogue_score.py @@ -2,7 +2,6 @@ from dataclasses import dataclass, field from langchain_core.callbacks import Callbacks -from rouge_score import rouge_scorer from ragas.dataset_schema import SingleTurnSample from ragas.metrics.base import MetricType, SingleTurnMetric @@ -18,6 +17,15 @@ class RougeScore(SingleTurnMetric): rogue_type: t.Literal["rouge1", "rougeL"] = "rougeL" measure_type: t.Literal["fmeasure", "precision", "recall"] = "fmeasure" + def __post_init__(self): + try: + from rouge_score import rouge_scorer + except ImportError as e: + raise ImportError( + f"{e.name} is required for rouge score. 
Please install it using `pip install {e.name}" + ) + self.rouge_scorer = rouge_scorer + def init(self, run_config: RunConfig): pass @@ -26,7 +34,7 @@ async def _single_turn_ascore( ) -> float: assert isinstance(sample.reference, str), "Sample reference must be a string" assert isinstance(sample.response, str), "Sample response must be a string" - scorer = rouge_scorer.RougeScorer([self.rogue_type], use_stemmer=True) + scorer = self.rouge_scorer.RougeScorer([self.rogue_type], use_stemmer=True) scores = scorer.score(sample.reference, sample.response) return getattr(scores[self.rogue_type], self.measure_type) From 7a1cdd73df9ec2fe6c1f4f985a2be84bf9339a8e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 23 Sep 2024 19:59:01 +0530 Subject: [PATCH 8/9] add distance measure --- src/ragas/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index ebb8f9fb4..35f3e261f 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -49,7 +49,7 @@ from ragas.metrics._noise_sensitivity import NoiseSensitivity from ragas.metrics._rogue_score import RougeScore from ragas.metrics._sql_semantic_equivalence import LLMSQLEquivalence -from ragas.metrics._string import ExactMatch, NonLLMStringSimilarity, StringPresence +from ragas.metrics._string import ExactMatch, NonLLMStringSimilarity, StringPresence, DistanceMeasure from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics._tool_call_accuracy import ToolCallAccuracy @@ -96,6 +96,7 @@ "ToolCallAccuracy", "ResponseRelevancy", "SemanticSimilarity", + "DistanceMeasure", ] current_module = sys.modules[__name__] From 13a33ee20e6b301d8b3f50046481dbc893a1789b Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 24 Sep 2024 09:44:54 +0530 Subject: [PATCH 9/9] add default factory --- src/ragas/metrics/_tool_call_accuracy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/_tool_call_accuracy.py b/src/ragas/metrics/_tool_call_accuracy.py index cc7c365c8..b56569219 100644 --- a/src/ragas/metrics/_tool_call_accuracy.py +++ b/src/ragas/metrics/_tool_call_accuracy.py @@ -27,7 +27,9 @@ class ToolCallAccuracy(MultiTurnMetric): } ) - arg_comparison_metric: SingleTurnMetric = ExactMatch() + arg_comparison_metric: SingleTurnMetric = field( + default_factory=lambda: ExactMatch() + ) def init(self, run_config): pass
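
Taken together, the series renames several metric classes (AnswerRelevancy -> ResponseRelevancy, AnswerSimilarity -> SemanticSimilarity, LLMSqlEquivalenceWithReference -> LLMSQLEquivalence), keeps the old relevancy/similarity names as thin compatibility subclasses, removes the pre-built module-level instances (rouge_score, noise_sensitivity_relevant, rubrics_score_with_reference, ...), and widens the ragas.metrics export list. The snippet below is a minimal sketch of what calling code looks like after these patches; it only uses names the patches themselves export, and it assumes the LLM/embeddings configuration is supplied later (the removed singletons were constructed the same way).

    from ragas.metrics import (
        NoiseSensitivity,
        ResponseRelevancy,   # new name; AnswerRelevancy remains as a compatibility subclass
        RougeScore,          # needs the optional rouge_score package at construction time
        SemanticSimilarity,  # new name; AnswerSimilarity remains as a compatibility subclass
    )

    # The removed singletons (noise_sensitivity_relevant, rouge_score, ...) are
    # replaced by explicit construction at the call site.
    metrics = [
        ResponseRelevancy(),
        SemanticSimilarity(),
        NoiseSensitivity(focus="irrelevant"),  # focus is now a Literal["relevant", "irrelevant"]
        RougeScore(rogue_type="rougeL", measure_type="fmeasure"),
    ]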
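Patch 7 moves the rouge_score import from module scope into RougeScore.__post_init__, so the package becomes an optional dependency that only users of this metric need to install; importing ragas.metrics no longer requires it. A small sketch of how that surfaces at the call site (the error wording comes from the patch, the handling around it is illustrative):

    from ragas.metrics import RougeScore

    try:
        rouge = RougeScore()  # the lazy import runs in __post_init__
    except ImportError as exc:
        # Raised only when the optional rouge_score package is missing.
        print(f"Optional dependency missing: {exc}")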
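The final patch swaps ToolCallAccuracy's class-level ExactMatch() default for field(default_factory=...). Presumably this avoids a single ExactMatch object being shared by every ToolCallAccuracy instance (and, on recent Python versions, dataclasses rejecting such a mutable default outright). A generic illustration of the pattern, independent of ragas:

    from dataclasses import dataclass, field

    @dataclass
    class Comparator:
        threshold: float = 0.5

    @dataclass
    class Scorer:
        # default_factory builds a fresh Comparator per Scorer instance;
        # `comparator: Comparator = Comparator()` would alias one shared object
        # and newer dataclasses versions reject it as a mutable default.
        comparator: Comparator = field(default_factory=Comparator)

    a, b = Scorer(), Scorer()
    assert a.comparator is not b.comparator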