From e906580b48eceeb4de98d47cf2969cf87475f105 Mon Sep 17 00:00:00 2001
From: Aakash Thatte
Date: Thu, 15 Aug 2024 19:23:14 +0530
Subject: [PATCH 1/7] Fix for edge case

---
 src/ragas/metrics/_summarization.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
index 885627680..ac5cfef1c 100644
--- a/src/ragas/metrics/_summarization.py
+++ b/src/ragas/metrics/_summarization.py
@@ -199,6 +199,8 @@ def _compute_conciseness_score(self, text, summary) -> float:
         ratio of the length of the summary to the length of the original text.
         This promotes shorter summaries.
         """
+        if len(summary)>len(text) or len(summary)==0:
+            return 0
         return 1 - (len(summary) / len(text))
 
     async def _extract_keyphrases(self, text: str, callbacks: Callbacks) -> t.List[str]:

From d390630fd05a6285e5fbc27f4808e03d42d9b120 Mon Sep 17 00:00:00 2001
From: Aakash Thatte
Date: Thu, 15 Aug 2024 19:35:16 +0530
Subject: [PATCH 2/7] format

---
 src/ragas/metrics/_summarization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
index ac5cfef1c..81478b9ea 100644
--- a/src/ragas/metrics/_summarization.py
+++ b/src/ragas/metrics/_summarization.py
@@ -199,7 +199,7 @@ def _compute_conciseness_score(self, text, summary) -> float:
         ratio of the length of the summary to the length of the original text.
         This promotes shorter summaries.
         """
-        if len(summary)>len(text) or len(summary)==0:
+        if len(summary) > len(text) or len(summary) == 0:
             return 0
         return 1 - (len(summary) / len(text))

From c1e8c7d8ac83403f85eb2ac61138067f0ec55993 Mon Sep 17 00:00:00 2001
From: Aakash Thatte
Date: Fri, 16 Aug 2024 21:26:11 +0530
Subject: [PATCH 3/7] refactor and add weighted average for scores

---
 src/ragas/metrics/_summarization.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
index 81478b9ea..518b227b4 100644
--- a/src/ragas/metrics/_summarization.py
+++ b/src/ragas/metrics/_summarization.py
@@ -145,6 +145,7 @@ class SummarizationScore(MetricWithLLM):
     name: str = "summary_score"  # type: ignore
     max_retries: int = 1
     length_penalty: bool = True
+    coeff: float = 0.5
     evaluation_mode: EvaluationMode = EvaluationMode.ca  # type: ignore[reportIncompatibleMethodOverride]
     question_generation_prompt: Prompt = field(
         default_factory=lambda: TEXT_GENERATE_QUESTIONS
@@ -153,6 +154,11 @@ class SummarizationScore(MetricWithLLM):
         default_factory=lambda: TEXT_GENERATE_ANSWERS
     )
 
+    weights = {
+        "qa_score": coeff,
+        "conciseness_score": 1-coeff,
+    }
+
     def _get_extract_keyphrases_prompt(self, text) -> PromptValue:
         return TEXT_EXTRACT_KEYPHRASES.format(text=text)
@@ -174,17 +180,17 @@ async def _ascore(self, row: Dict, callbacks: Callbacks) -> float:
         questions = await self._get_questions(text, keyphrases, callbacks)
         answers = await self._get_answers(questions, summary, callbacks)
 
-        scores = []
+        scores = {}
         qa_score = self._compute_qa_score(answers)
-        scores.append(qa_score)
+        scores["qa_score"] = qa_score
         if self.length_penalty:
             conciseness_score = self._compute_conciseness_score(text, summary)
-            scores.append(conciseness_score)
+            scores["conciseness_score"] = conciseness_score
         return self._compute_score(scores)
 
     def _compute_score(self, scores) -> float:
-        """Returns average score of the different scores."""
-        return sum(scores) / len(scores)
+        """Returns weighted average of the different scores."""
+        return sum([scores[k] * self.weights[k] for k in scores])
 
     def _compute_qa_score(self, answers: t.List[str]) -> float:
         """Returns a score between 0 and 1 reflecting the fraction of
         correct answers, ie with a value 'yes'
         """
@@ -199,9 +205,7 @@ def _compute_conciseness_score(self, text, summary) -> float:
         ratio of the length of the summary to the length of the original text.
         This promotes shorter summaries.
         """
-        if len(summary) > len(text) or len(summary) == 0:
-            return 0
-        return 1 - (len(summary) / len(text))
+        return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)
 
     async def _extract_keyphrases(self, text: str, callbacks: Callbacks) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"

From b3e1c324a06ae4cf6e7da729ddde934a82848048 Mon Sep 17 00:00:00 2001
From: Aakash Thatte
Date: Fri, 16 Aug 2024 21:26:27 +0530
Subject: [PATCH 4/7] update docs for summarization

---
 docs/concepts/metrics/summarization_score.md | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/docs/concepts/metrics/summarization_score.md b/docs/concepts/metrics/summarization_score.md
index 539785278..466fc1737 100644
--- a/docs/concepts/metrics/summarization_score.md
+++ b/docs/concepts/metrics/summarization_score.md
@@ -11,20 +11,32 @@ We compute the question-answer score using the answers, which is a list of `1`s
 \text{QA score} = \frac{|\text{correctly answered questions}|}{|\text{total questions}|}
 ````
 
-We also introduce an option to penalize larger summaries by proving a conciseness score. If this option is enabled, the final score is calculated as the average of the summarization score and the conciseness score. This conciseness scores ensures that summaries that are just copies of the text do not get a high score, because they will obviously answer all questions correctly.
+We also introduce an option to penalize larger summaries by proving a conciseness score. If this option is enabled, the final score is calculated as the weighted average of the summarization score and the conciseness score. This conciseness scores ensures that summaries that are just copies of the text do not get a high score, because they will obviously answer all questions correctly. Also, we do not want the summaries that are empty. We add a small value `1e-10` to the denominator to avoid division by zero.
 
 ```{math}
 :label: conciseness-score
 
-\text{conciseness score} = 1 - \frac{\text{length of summary}}{\text{length of context}}
+\text{conciseness score} = 1 - \frac{\min(\text{length of summary}, \text{length of context})}{\text{length of context} + \text{1e-10}}
 ````
 
+We also provide a coefficient `coeff`(default value 0.5) to control the weightage of the scores.
+
 The final summarization score is then calculated as:
 
 ```{math}
 :label: summarization-score
 
-\text{Summarization Score} = \frac{\text{QA score} + \text{conciseness score}}{2}
+\text{Summarization Score} = \text{QA score}*\text{coeff} + \\
+\text{conciseness score}*\text{(1-coeff)}
 ````
 
+Internally, we use a dictionary of weights:
+```{math}
+```{code-block} python
+weights = {
+    "qa_score": coeff,
+    "conciseness_score": 1-coeff
+}
+```
+
 ```{hint}
 **Summary**: JPMorgan Chase & Co. is an American multinational finance company headquartered in New York City. It is the largest bank in the United States and the world's largest by market capitalization as of 2023. Founded in 1799, it is a major provider of investment banking services, with US$3.9 trillion in total assets, and ranked #1 in the Forbes Global 2000 ranking in 2023.
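For a quick illustration of the weighting scheme that PATCH 3 introduces in `_compute_score` and PATCH 4 documents, here is a minimal standalone sketch; it is not part of the patch series, and the score values are made up:

```python
# Weighting scheme from PATCH 3: a single coefficient controls how much the
# QA score and the conciseness score each contribute to the final score.
coeff = 0.5  # default value defined on SummarizationScore

weights = {
    "qa_score": coeff,
    "conciseness_score": 1 - coeff,
}

def compute_score(scores: dict) -> float:
    # Mirrors _compute_score: weighted sum over whichever scores are present.
    return sum(scores[k] * weights[k] for k in scores)

# Illustrative values only: a summary that answers 8 of 10 questions and is
# roughly 30% of the source length.
print(compute_score({"qa_score": 0.8, "conciseness_score": 0.7}))  # 0.5*0.8 + 0.5*0.7 = 0.75
```

With `length_penalty` disabled only `"qa_score"` is present, so this weighted sum scales the QA score by `coeff` alone; PATCH 6 later drops the dictionary and inlines the combination, also swapping which term `coeff` weights (`qa_score * (1 - coeff) + conciseness_score * coeff`).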
From dc0c81538644d5945b379b42fe389d405bdfb25f Mon Sep 17 00:00:00 2001
From: Aakash Thatte
Date: Fri, 16 Aug 2024 21:36:31 +0530
Subject: [PATCH 5/7] fix junk

---
 docs/concepts/metrics/summarization_score.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/concepts/metrics/summarization_score.md b/docs/concepts/metrics/summarization_score.md
index 466fc1737..a7f2b664d 100644
--- a/docs/concepts/metrics/summarization_score.md
+++ b/docs/concepts/metrics/summarization_score.md
@@ -29,7 +29,6 @@ The final summarization score is then calculated as:
 ````
 
 Internally, we use a dictionary of weights:
-```{math}
 ```{code-block} python
 weights = {
     "qa_score": coeff,
     "conciseness_score": 1-coeff
 }
 ```

From a9f5e59bf13190c2dd482b4c0b3a29dd13aa1545 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 23 Aug 2024 17:44:18 +0530
Subject: [PATCH 6/7] change prompt settings and minor fixes

---
 src/ragas/metrics/_summarization.py | 72 ++++++++++++------------------
 1 file changed, 28 insertions(+), 44 deletions(-)

diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
index 518b227b4..243b51a46 100644
--- a/src/ragas/metrics/_summarization.py
+++ b/src/ragas/metrics/_summarization.py
@@ -8,7 +8,7 @@
 from langchain.pydantic_v1 import BaseModel
 
 from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions
-from ragas.llms.prompt import Prompt, PromptValue
+from ragas.llms.prompt import Prompt
 from ragas.metrics.base import EvaluationMode, MetricWithLLM
 
 if t.TYPE_CHECKING:
@@ -146,34 +146,18 @@ class SummarizationScore(MetricWithLLM):
     max_retries: int = 1
     length_penalty: bool = True
     coeff: float = 0.5
-    evaluation_mode: EvaluationMode = EvaluationMode.ca  # type: ignore[reportIncompatibleMethodOverride]
+    evaluation_mode: EvaluationMode = EvaluationMode.ca  # type: ignore
     question_generation_prompt: Prompt = field(
         default_factory=lambda: TEXT_GENERATE_QUESTIONS
     )
     answer_generation_prompt: Prompt = field(
         default_factory=lambda: TEXT_GENERATE_ANSWERS
     )
-
-    weights = {
-        "qa_score": coeff,
-        "conciseness_score": 1-coeff,
-    }
-
-    def _get_extract_keyphrases_prompt(self, text) -> PromptValue:
-        return TEXT_EXTRACT_KEYPHRASES.format(text=text)
-
-    def _get_question_generation_prompt(self, text, keyphrases) -> PromptValue:
-        return TEXT_GENERATE_QUESTIONS.format(text=text, keyphrases=keyphrases)
-
-    def _get_answer_generation_prompt(
-        self, questions: t.List, summary: str
-    ) -> PromptValue:
-        return TEXT_GENERATE_ANSWERS.format(summary=summary, questions=questions)
+    extract_keyphrases_prompt: Prompt = field(
+        default_factory=lambda: TEXT_EXTRACT_KEYPHRASES
+    )
 
     async def _ascore(self, row: Dict, callbacks: Callbacks) -> float:
-        # text is the contexts provided
-        # summary is the summary generated by the model
-        # TODO: add support for the query used as well
         text: str = "\n".join(row["contexts"])
         summary: str = row["summary"]
         keyphrases = await self._extract_keyphrases(text, callbacks)
@@ -189,27 +173,21 @@ async def _ascore(self, row: Dict, callbacks: Callbacks) -> float:
         questions = await self._get_questions(text, keyphrases, callbacks)
         answers = await self._get_answers(questions, summary, callbacks)
 
         scores = {}
         qa_score = self._compute_qa_score(answers)
         scores["qa_score"] = qa_score
         if self.length_penalty:
             conciseness_score = self._compute_conciseness_score(text, summary)
             scores["conciseness_score"] = conciseness_score
         return self._compute_score(scores)
 
     def _compute_score(self, scores) -> float:
-        """Returns weighted average of the different scores."""
-        return sum([scores[k] * self.weights[k] for k in scores])
+        return (
+            scores["qa_score"] * (1 - self.coeff)
+            + scores.get("conciseness_score", 0) * self.coeff
+        )
 
     def _compute_qa_score(self, answers: t.List[str]) -> float:
-        """Returns a score between 0 and 1 reflecting the fraction of
-        correct answers, ie with a value 'yes'
-        """
         correct = sum([1 for a in answers if a.lower() == "1"])
         return correct / len(answers)
 
     def _compute_conciseness_score(self, text, summary) -> float:
-        """Returns the conciseness score of the summary. This is calculated as
-        (1- relative_length_of_summary), where relative_length_of_summary is the
-        ratio of the length of the summary to the length of the original text.
-        This promotes shorter summaries.
-        """
         return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)
 
     async def _extract_keyphrases(self, text: str, callbacks: Callbacks) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"
-        p_value = self._get_extract_keyphrases_prompt(text)
+        p_value = self.extract_keyphrases_prompt.format(text=text)
         result = await self.llm.generate(
             prompt=p_value,
             callbacks=callbacks,
@@ -229,7 +207,9 @@ async def _get_questions(
         self, text: str, keyphrases: list[str], callbacks: Callbacks
     ) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"
-        p_value = self._get_question_generation_prompt(text, keyphrases)
+        p_value = self.question_generation_prompt.format(
+            text=text, keyphrases=keyphrases
+        )
         result = await self.llm.generate(
             prompt=p_value,
             callbacks=callbacks,
@@ -250,7 +230,9 @@ async def _get_answers(
         self, questions: t.List[str], summary: str, callbacks: Callbacks
     ) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"
-        p_value = self._get_answer_generation_prompt(questions, summary)
+        p_value = self.answer_generation_prompt.format(
+            questions=questions, summary=summary
+        )
         result = await self.llm.generate(
             prompt=p_value,
             callbacks=callbacks,
@@ -267,17 +249,19 @@ async def _get_answers(
 
         return response.answers
 
+    def adapt(self, language: str, cache_dir: str | None = None) -> None:
+        assert self.llm is not None, "set LLM before use"
 
-def adapt(self, language: str, cache_dir: str | None = None) -> None:
-    assert self.llm is not None, "set LLM before use"
-
-    logger.info(f"Adapting summarization to {language}")
-    self.question_generation_prompt = self.question_generation_prompt.adapt(
-        language, self.llm, cache_dir
-    )
-    self.answer_generation_prompt = self.answer_generation_prompt.adapt(
-        language, self.llm, cache_dir
-    )
+        logger.info(f"Adapting summarization to {language}")
+        self.question_generation_prompt = self.question_generation_prompt.adapt(
+            language, self.llm, cache_dir
+        )
+        self.answer_generation_prompt = self.answer_generation_prompt.adapt(
+            language, self.llm, cache_dir
+        )
+        self.answer_generation_prompt = self.answer_generation_prompt.adapt(
+            language, self.llm, cache_dir
+        )
 
 summarization_score = SummarizationScore()

From f38b826ec0ae33da56797e228645127c60ded9a5 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 23 Aug 2024 17:44:29 +0530
Subject: [PATCH 7/7] improve docs

---
 docs/concepts/metrics/summarization_score.md | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/docs/concepts/metrics/summarization_score.md b/docs/concepts/metrics/summarization_score.md
index a7f2b664d..188ec4be8 100644
--- a/docs/concepts/metrics/summarization_score.md
+++ b/docs/concepts/metrics/summarization_score.md
@@ -28,14 +28,6 @@ The final summarization score is then calculated as:
 \text{conciseness score}*\text{(1-coeff)}
 ````
 
-Internally, we use a dictionary of weights:
-```{code-block} python
-weights = {
-    "qa_score": coeff,
-    "conciseness_score": 1-coeff
-}
-```
-
 ```{hint}
 **Summary**: JPMorgan Chase & Co. is an American multinational finance company headquartered in New York City.
 It is the largest bank in the United States and the world's largest by market capitalization as of 2023. Founded in 1799, it is a major provider of investment banking services, with US$3.9 trillion in total assets, and ranked #1 in the Forbes Global 2000 ranking in 2023.
@@ -72,13 +64,14 @@ weights = {
 ## Example
 
 ```{code-block} python
-from datasets import Dataset
 from ragas.metrics import summarization_score
 from ragas import evaluate
+from datasets import Dataset
+
 data_samples = {
-    'contexts' : [[c1], [c2]],
-    'summary': [s1, s2]
+    'contexts':[["A company is launching a new product, a smartphone app designed to help users track their fitness goals. The app allows users to set daily exercise targets, log their meals, and track their water intake. It also provides personalized workout recommendations and sends motivational reminders throughout the day."]],
+    'summary':['A company is launching a fitness tracking app that helps users set exercise goals, log meals, and track water intake, with personalized workout suggestions and motivational reminders.'],
 }
 dataset = Dataset.from_dict(data_samples)
 score = evaluate(dataset,metrics=[summarization_score])
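To see the scoring arithmetic the series ends up with in one place, here is a standalone sketch of the final behaviour; it is not part of the patches, the helper names are ours rather than ragas API, and the input values are made up:

```python
# Scoring arithmetic after PATCH 6, reproduced outside the SummarizationScore class.
def conciseness_score(text: str, summary: str) -> float:
    # min(...) drives the score to ~0 for summaries longer than the text, and
    # the 1e-10 term guards against division by zero (edge cases from PATCH 1/3).
    return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)

def qa_score(answers: list) -> float:
    # Fraction of generated questions the summary answered ("1" = correct).
    correct = sum(1 for a in answers if a.lower() == "1")
    return correct / len(answers)

def combined_score(text, summary, answers, coeff=0.5, length_penalty=True):
    # PATCH 6: QA score weighted by (1 - coeff), conciseness score by coeff.
    scores = {"qa_score": qa_score(answers)}
    if length_penalty:
        scores["conciseness_score"] = conciseness_score(text, summary)
    return scores["qa_score"] * (1 - coeff) + scores.get("conciseness_score", 0) * coeff

# Illustrative values only.
text = "word " * 200      # stand-in for the joined contexts
summary = "word " * 60    # a summary ~30% of the source length
answers = ["1", "1", "1", "0", "1"]
print(round(combined_score(text, summary, answers), 3))  # 0.75
```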