From 173c167d2a1a070072414c37f08781426e095ebd Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Thu, 18 Jul 2024 21:01:07 +0530 Subject: [PATCH 01/19] Updated regex_based.py from iter-v3 branch --- .../testset/extractors/regex_based.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/extractors/regex_based.py b/src/experimental/ragas_experimental/testset/extractors/regex_based.py index 1c3812fbf..c7c83c55a 100644 --- a/src/experimental/ragas_experimental/testset/extractors/regex_based.py +++ b/src/experimental/ragas_experimental/testset/extractors/regex_based.py @@ -31,19 +31,16 @@ def extract_text(self, text): ) result = defaultdict(list) for m in matches: - m = {k: v for k, v in m.groupdict().items() if v is not None} - for key in m: - result[key].append(m[key]) - + m_dict = {k: v for k, v in m.groupdict().items() if v is not None} + for key, value in m_dict.items(): + result[key].append(value) return result def extract(self, node: t.Union[Node, LCDocument]) -> t.Any: return super().extract(node) def merge_extractors(self, *extractors) -> t.List[Extractor]: - if isinstance( - self, RulebasedExtractor - ): # Check if called by an initiated class + if isinstance(self, RulebasedExtractor): extractors = (self,) + extractors assert all( @@ -70,13 +67,28 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: added_indices.append(extractors.index(ext)) extractors_to_return = [] - for extractors in final_extractors: + for group_index, extractors in enumerate(final_extractors): if len(extractors) > 1: - pattern = "|".join([extractor.pattern for extractor in extractors]) - updated_regex = Regex(name="merged_extractor", pattern=pattern) + # Process each pattern individually + processed_patterns = [] + for extractor in extractors: + pattern = extractor.pattern + # Extract flags from the beginning of the pattern + flags = "" + if pattern.startswith("(?"): + flag_end = pattern.index(")") + flags = pattern[2:flag_end] + pattern = pattern[flag_end + 1:] + # Wrap the pattern in a non-capturing group with flags + processed_patterns.append(f"(?{flags}:{pattern})") + + # Join all processed patterns + merged_pattern = "|".join(processed_patterns) + + updated_regex = Regex(name="merged_extractor", pattern=merged_pattern) else: - pattern = extractors[0].pattern updated_regex = extractors[0].regex + extractors_to_return.append( RulebasedExtractor( attribute=extractors[0].attribute, @@ -86,10 +98,9 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: ) return extractors_to_return - links_extractor_pattern = r"(?i)\b(?:https?://|www\.)\S+\b" emails_extractor_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+" -markdown_headings = r"^(#{1,6})\s+(.*)" +markdown_headings_pattern = r"^(#{1,6})\s+(.*)" email_extractor = RulebasedExtractor( regex=Regex(name="email", pattern=emails_extractor_pattern) @@ -98,5 +109,6 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: regex=Regex(name="link", pattern=links_extractor_pattern) ) markdown_headings = RulebasedExtractor( - regex=Regex(name="markdown_headings", pattern=markdown_headings), is_multiline=True + regex=Regex(name="markdown_headings", pattern=markdown_headings_pattern), + is_multiline=True ) From c8fb5ffd25e3bb2d0b776115889b51710afdba5f Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Fri, 9 Aug 2024 21:41:26 +0530 Subject: [PATCH 02/19] New update --- .../ragas_experimental/testset/generators/simple.py | 6 +++--- 
.../testset/splitters/section_splitter.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/generators/simple.py b/src/experimental/ragas_experimental/testset/generators/simple.py index edc034475..88e461d54 100644 --- a/src/experimental/ragas_experimental/testset/generators/simple.py +++ b/src/experimental/ragas_experimental/testset/generators/simple.py @@ -1,10 +1,10 @@ import typing as t - import numpy as np from langchain_core.documents import Document from ragas_experimental.testset.extractors import ( DocumentExtractor, email_extractor, + markdown_headings, headline_extractor, keyphrase_extractor, link_extractor, @@ -63,6 +63,7 @@ def _document_exraction(self, docs: t.Sequence[Document]) -> t.Sequence[Document summary_extractor, link_extractor, email_extractor, + markdown_headings, keyphrase_extractor, title_extractor, headline_extractor, @@ -89,6 +90,7 @@ def generate( link_extractor, email_extractor, keyphrase_extractor, + markdown_headings, title_extractor, headline_extractor, ] @@ -96,10 +98,8 @@ def generate( extractors=extractors, llm=self.llm, embedding=self.embedding ) docs = doc_extractor.extract(docs) - splitter = HeadlineSplitter(common_metadata_keys=["source", "title"]) nodes, relationships = splitter.split_documents(docs, "headlines") - nodes = doc_extractor.embed( nodes, ["page_content", "summary"], diff --git a/src/experimental/ragas_experimental/testset/splitters/section_splitter.py b/src/experimental/ragas_experimental/testset/splitters/section_splitter.py index f22328d64..368d8364c 100644 --- a/src/experimental/ragas_experimental/testset/splitters/section_splitter.py +++ b/src/experimental/ragas_experimental/testset/splitters/section_splitter.py @@ -1,6 +1,5 @@ import re import typing as t - import numpy as np from langchain_core.documents import Document as LCDocument from ragas_experimental.testset.graph import Node, NodeLevel, NodeType, Relationship From 7c980885b58facec190bf9cab5a3c18b91a2db6e Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Fri, 9 Aug 2024 21:58:21 +0530 Subject: [PATCH 03/19] update to_pandas() methods --- .../testset/generators/base.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/generators/base.py b/src/experimental/ragas_experimental/testset/generators/base.py index ce40adc38..22fba0288 100644 --- a/src/experimental/ragas_experimental/testset/generators/base.py +++ b/src/experimental/ragas_experimental/testset/generators/base.py @@ -32,8 +32,24 @@ class TestDataset: def to_pandas(self): data = [] - for row in self.qac: - data.append(row.to_dict()) + for item in self.qac: + if isinstance(item, list): + for subitem in item: + if isinstance(subitem, QAC): + data.append(subitem.to_dict()) + elif isinstance(subitem, dict): + data.append(subitem) + else: + raise TypeError(f"Unexpected type in qac list: {type(subitem)}") + elif isinstance(item, QAC): + data.append(item.to_dict()) + elif isinstance(item, dict): + data.append(item) + elif isinstance(item, type) and issubclass(item, QAC): + pass + else: + raise TypeError(f"Unexpected type in qac: {type(item)}") + return pd.DataFrame(data) From 18c9c6c40fef9641a982ebb4543749bad9307d30 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Fri, 9 Aug 2024 21:59:23 +0530 Subject: [PATCH 04/19] Add to_dict() method to QAC class --- .../ragas_experimental/testset/questions/base.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff 
--git a/src/experimental/ragas_experimental/testset/questions/base.py b/src/experimental/ragas_experimental/testset/questions/base.py index 1ab7c9c38..22890c10e 100644 --- a/src/experimental/ragas_experimental/testset/questions/base.py +++ b/src/experimental/ragas_experimental/testset/questions/base.py @@ -43,7 +43,15 @@ class QAC: style: t.Optional[QuestionStyle] = QuestionStyle.PERFECT_GRAMMAR length: t.Optional[QuestionLength] = QuestionLength.MEDIUM - + def to_dict(self): + return { + "question": self.question, + "answer": self.answer, + "source": [self.source] if self.source else None, + "name": self.name, + "style": self.style, + "length": self.length + } @dataclass class StyleLengthDistribution: style_length_distribution: t.Dict[ From 40eeea5be5be73e84ebffe6f668accdbb6276d16 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Mon, 12 Aug 2024 23:34:58 +0530 Subject: [PATCH 05/19] Add prometheus-eval to metrics --- src/ragas/metrics/__init__.py | 2 + src/ragas/metrics/_prometheus.py | 129 +++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 src/ragas/metrics/_prometheus.py diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index e22d3fa57..f5f587fac 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -11,6 +11,7 @@ context_precision, context_utilization, ) +from ragas.metrics._prometheus import Prometheus from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, faithfulness from ragas.metrics._summarization import SummarizationScore, summarization_score @@ -36,4 +37,5 @@ "context_entity_recall", "SummarizationScore", "summarization_score", + "Prometheus", ] diff --git a/src/ragas/metrics/_prometheus.py b/src/ragas/metrics/_prometheus.py new file mode 100644 index 000000000..d693bf37c --- /dev/null +++ b/src/ragas/metrics/_prometheus.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import typing as t +from enum import Enum +import numpy as np +from langchain_core.pydantic_v1 import BaseModel, Field + +from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.prompt import Prompt +from ragas.metrics.base import EvaluationMode, MetricWithLLM + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.prompt import PromptValue + + +class PrometheusMode(Enum): + ABSOLUTE = "absolute" + RELATIVE = "relative" + + +class ScoreFeedback(BaseModel): + feedback: str = Field(..., description="The feedback for the response") + score: int = Field(..., description="The score given to the response") + + +class ScoreFeedbackAnswers(BaseModel): + __root__: t.List[ScoreFeedback] + + def dicts(self) -> t.List[t.Dict]: + return self.dict()["__root__"] + + +_score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers) +_score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers) + + +PROMETHEUS_ABSOLUTE_PROMPT = Prompt( + name="prometheus_score", + output_format_instruction=_score_feedback_output_instructions, + instruction="""Given an instruction, a response, a reference answer, and a score rubric, evaluate the response by providing detailed feedback and assigning a score between 1 and 5. Format the output as JSON with 'feedback' and 'score' fields.""", + examples=[ + { + "question": "Struggling with a recent break-up, a person opens up about the intense feelings of loneliness and sadness. 
They ask for advice on how to cope with the heartbreak and move forward in life.", + "answer": """I'm genuinely sorry to hear about your break-up. This can be an immensely challenging time filled with intense emotions of loneliness and sorrow. It's important to remember that it's normal to experience these feelings; they are a testament to the deep connection you once had. Firstly, don't hesitate to embrace your emotions, let them flow freely. Ignoring them often complicates the healing process. It's natural to be upset, to shed tears, or to feel angry. This is all part of the journey towards healing. Self-care can be a healing balm in these tough times. This could mean indulging in your favourite book, journaling your thoughts, or even just a quiet cup of tea. Small acts of kindness to oneself can significantly improve your emotional well-being. Also, don't isolate yourself. Surround yourself with supportive individuals who can offer comfort and understanding. This might include friends, family, or even a mental health professional. They can provide much-needed emotional support and strategies to help you cope. Lastly, remember there's no set timeline for healing. It's different for everyone and it's okay to take as long as you need. Keep your focus on the future and the potential it holds. Though it may not seem possible now, you will emerge stronger from this experience. It's okay to ask for help, it's okay to grieve, and it's okay to take it one day at a time. You're not alone in this journey, and there are people ready and willing to help. It's okay to heal at your own pace.""", + "ground_truth": """I can only imagine how difficult this time must be for you. Heartbreak can feel like an overwhelming wave that washes over you, making you feel lost and alone. It's important to remember that it's okay to feel this way; your feelings are valid and it's natural to grieve the loss of a relationship. Firstly, it's crucial to allow yourself to feel the emotions fully. Suppressing or denying them often only prolongs the healing process. It's okay to cry, to feel sad, or to be angry. These are all part of the healing journey. Engaging in self-care activities can also provide a much-needed boost. This could be anything from taking a warm bath, writing in a journal, or simply drinking a cup of your favorite tea. Small acts of self-love can make a big difference in how you feel. Next, try to surround yourself with supportive people who understand your situation and provide comfort. Friends and family can be a great source of strength in times of heartbreak. If you feel comfortable, you might want to consider seeking professional help. Therapists and counselors are trained to provide assistance and tools to navigate through difficult times like these. Lastly, it's important to remember that it's okay to take your time to heal. Everyone has their own pace and there's no rush. Try to focus on the future and the possibilities it holds. While it may not seem like it now, you will come out stronger and more resilient from this experience. Remember, it's okay to ask for help and it's okay to feel the way you feel. You are not alone in this journey and there are people who care about you and want to help. It's okay to take one day at a time. 
Healing is a process, and it's okay to move through it at your own pace.""", + "rubrics": { + "criteria": "Is the model proficient in applying empathy and emotional intelligence to its responses when the user conveys emotions or faces challenging circumstances?", + "score1_description": "The model neglects to identify or react to the emotional tone of user inputs, giving responses that are unfitting or emotionally insensitive.", + "score2_description": "The model intermittently acknowledges emotional context but often responds without sufficient empathy or emotional understanding.", + "score3_description": "The model typically identifies emotional context and attempts to answer with empathy, yet the responses might sometimes miss the point or lack emotional profundity.", + "score4_description": "The model consistently identifies and reacts suitably to emotional context, providing empathetic responses. Nonetheless, there may still be sporadic oversights or deficiencies in emotional depth.", + "score5_description": "The model excels in identifying emotional context and persistently offers empathetic, emotionally aware responses that demonstrate a profound comprehension of the user's emotions or situation." + }, + "analysis": ScoreFeedbackAnswers.parse_obj( + [ + { + "feedback": """The response provided shows a high level of empathy and emotional intelligence. It effectively addresses the emotional distress expressed by the user. It acknowledges the user's pain and validates their feelings of loneliness and sadness, which is a crucial aspect of providing empathetic advice. The response also suggests practical steps for coping, such as embracing emotions, practicing self-care, and seeking support from friends, family, or professionals. Furthermore, the response reassures the user that healing is a personal process with no fixed timeline, offering comfort and understanding. It emphasizes the user's worth and potential to overcome the situation, which demonstrates a profound comprehension of the user's emotions and situation. By comparing the score rubric with the provided response, it is clear that the model exhibits an excellent ability to apply empathy and emotional intelligence. 
The response does not have any deficiencies in emotional depth and successfully meets the criteria for a score of 5.""", + "score": 5, + } + ] + ).dicts(), + } + ], + input_keys=["question", "answer", "ground_truth", "rubrics"], + output_key="analysis", + language="english", +) + +class Prometheus(MetricWithLLM): + name = "prometheus" + evaluation_mode = EvaluationMode.qga # Uses question, ground truth, answer + + def __init__( + self, + mode: PrometheusMode = PrometheusMode.ABSOLUTE, + rubrics: Optional[Dict] = None, + llm: Optional[BaseRagasLLM] = None, + max_retries: int = 1, + ): + super().__init__(llm=llm) + self.mode = mode + self.rubrics = rubrics + self.max_retries = max_retries + + + async def _ascore(self, row: Dict, callbacks: t.Callbacks, is_async: bool = False) -> float: + if self.mode == PrometheusMode.ABSOLUTE: + return await self._absolute_score(row, callbacks) + elif self.mode == PrometheusMode.RELATIVE: + return await self._relative_score(row, callbacks) + else: + raise ValueError(f"Invalid mode: {self.mode}") + + def _create_prompt(self, row: Dict) -> Prompt: + return PROMETHEUS_ABSOLUTE_PROMPT.format( + question=row.get('question', ''), + answer=row.get('answer', ''), + ground_truth=row.get('ground_truth', ''), + rubrics=self.rubrics, + ) + + async def _absolute_score(self, row: Dict, callbacks: t.Callbacks) -> float: + prompt_value = self._create_prompt(row) + + + response = await self.llm.generate(prompt_value, callbacks=callbacks) + + + parsed_response = await _score_feedback_output_parser.aparse( + response.generations[0][0].text, prompt_value, self.llm, self.max_retries + ) + + if parsed_response is None: + return np.nan + + score = parsed_response.dicts()[0]['score'] + return score + + async def _relative_score(self, row: Dict, callbacks: t.Callbacks) -> float: + # Implement relative scoring logic here, similar to absolute scoring + return 0.5 + + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + PROMETHEUS_ABSOLUTE_PROMPT.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: t.Optional[str] = None) -> None: + PROMETHEUS_ABSOLUTE_PROMPT.save(cache_dir) From 6d47d228a6167da6e90898d4932df5bcb6a19ea6 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran <107925721+vaishakhRaveendran@users.noreply.github.com> Date: Mon, 12 Aug 2024 23:46:47 +0530 Subject: [PATCH 06/19] Update base.py --- .../ragas_experimental/testset/questions/base.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/questions/base.py b/src/experimental/ragas_experimental/testset/questions/base.py index 22890c10e..a129d20ce 100644 --- a/src/experimental/ragas_experimental/testset/questions/base.py +++ b/src/experimental/ragas_experimental/testset/questions/base.py @@ -42,16 +42,7 @@ class QAC: name: t.Optional[str] = None style: t.Optional[QuestionStyle] = QuestionStyle.PERFECT_GRAMMAR length: t.Optional[QuestionLength] = QuestionLength.MEDIUM - - def to_dict(self): - return { - "question": self.question, - "answer": self.answer, - "source": [self.source] if self.source else None, - "name": self.name, - "style": self.style, - "length": self.length - } + @dataclass class StyleLengthDistribution: style_length_distribution: t.Dict[ From e9f02402914370274aea73f9af65231eb18c76a1 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Mon, 12 Aug 2024 23:58:32 +0530 Subject: [PATCH 07/19] roll back --- .../ragas_experimental/testset/extractors/regex_based.py | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/extractors/regex_based.py b/src/experimental/ragas_experimental/testset/extractors/regex_based.py index c7c83c55a..e69dad299 100644 --- a/src/experimental/ragas_experimental/testset/extractors/regex_based.py +++ b/src/experimental/ragas_experimental/testset/extractors/regex_based.py @@ -4,7 +4,6 @@ from dataclasses import dataclass from langchain_core.documents import Document as LCDocument - from ragas_experimental.testset.extractors.base import Extractor, Regex from ragas_experimental.testset.graph import Node @@ -78,7 +77,7 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: if pattern.startswith("(?"): flag_end = pattern.index(")") flags = pattern[2:flag_end] - pattern = pattern[flag_end + 1:] + pattern = pattern[flag_end + 1 :] # Wrap the pattern in a non-capturing group with flags processed_patterns.append(f"(?{flags}:{pattern})") @@ -98,6 +97,7 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: ) return extractors_to_return + links_extractor_pattern = r"(?i)\b(?:https?://|www\.)\S+\b" emails_extractor_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+" markdown_headings_pattern = r"^(#{1,6})\s+(.*)" @@ -110,5 +110,5 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: ) markdown_headings = RulebasedExtractor( regex=Regex(name="markdown_headings", pattern=markdown_headings_pattern), - is_multiline=True -) + is_multiline=True, +) \ No newline at end of file From 02fa52893d6fbd68b26f7bfbe90d70cd84d2d571 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Tue, 13 Aug 2024 00:03:50 +0530 Subject: [PATCH 08/19] roll back --- .../testset/generators/base.py | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/generators/base.py b/src/experimental/ragas_experimental/testset/generators/base.py index 22fba0288..64299fb42 100644 --- a/src/experimental/ragas_experimental/testset/generators/base.py +++ b/src/experimental/ragas_experimental/testset/generators/base.py @@ -32,24 +32,8 @@ class TestDataset: def to_pandas(self): data = [] - for item in self.qac: - if isinstance(item, list): - for subitem in item: - if isinstance(subitem, QAC): - data.append(subitem.to_dict()) - elif isinstance(subitem, dict): - data.append(subitem) - else: - raise TypeError(f"Unexpected type in qac list: {type(subitem)}") - elif isinstance(item, QAC): - data.append(item.to_dict()) - elif isinstance(item, dict): - data.append(item) - elif isinstance(item, type) and issubclass(item, QAC): - pass - else: - raise TypeError(f"Unexpected type in qac: {type(item)}") - + for row in self.qac: + data.append(row.to_dict()) return pd.DataFrame(data) @@ -82,4 +66,4 @@ def generate_with_llamaindex_docs( distribution: QADistribution, ) -> TestDataset: docs = [doc.to_langchain_format() for doc in docs] - return self.generate(docs, test_size, distribution) + return self.generate(docs, test_size, distribution) \ No newline at end of file From 09dc1ef79e0bcfdb238d01f25d640f09bfd451ea Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Tue, 13 Aug 2024 00:05:12 +0530 Subject: [PATCH 09/19] roll back --- .../ragas_experimental/testset/generators/simple.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/generators/simple.py b/src/experimental/ragas_experimental/testset/generators/simple.py index f0567dcc2..6609df06e 100644 --- 
a/src/experimental/ragas_experimental/testset/generators/simple.py +++ b/src/experimental/ragas_experimental/testset/generators/simple.py @@ -1,10 +1,10 @@ import typing as t + import numpy as np from langchain_core.documents import Document from ragas_experimental.testset.extractors import ( DocumentExtractor, email_extractor, - markdown_headings, headline_extractor, keyphrase_extractor, link_extractor, @@ -60,7 +60,6 @@ def _document_exraction(self, docs: t.Sequence[Document]) -> t.Sequence[Document summary_extractor, link_extractor, email_extractor, - markdown_headings, keyphrase_extractor, title_extractor, headline_extractor, @@ -87,7 +86,6 @@ def generate( link_extractor, email_extractor, keyphrase_extractor, - markdown_headings, title_extractor, headline_extractor, ] @@ -95,8 +93,10 @@ def generate( extractors=extractors, llm=self.llm, embedding=self.embedding ) docs = doc_extractor.extract(docs) + splitter = HeadlineSplitter(common_metadata_keys=["source", "title"]) nodes, relationships = splitter.split_documents(docs, "headlines") + nodes = doc_extractor.embed( nodes, ["page_content", "summary"], @@ -184,4 +184,4 @@ def generate( is_experiment=True, ) ) - return results + return results \ No newline at end of file From 8f8f150caf2b46553322502a604d31db69232641 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 13 Aug 2024 16:59:01 +0530 Subject: [PATCH 10/19] improve and shorten prompt --- src/ragas/metrics/_prometheus.py | 35 +++++++++++++------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/ragas/metrics/_prometheus.py b/src/ragas/metrics/_prometheus.py index d693bf37c..6c012dd4f 100644 --- a/src/ragas/metrics/_prometheus.py +++ b/src/ragas/metrics/_prometheus.py @@ -37,30 +37,22 @@ def dicts(self) -> t.List[t.Dict]: PROMETHEUS_ABSOLUTE_PROMPT = Prompt( - name="prometheus_score", - output_format_instruction=_score_feedback_output_instructions, - instruction="""Given an instruction, a response, a reference answer, and a score rubric, evaluate the response by providing detailed feedback and assigning a score between 1 and 5. Format the output as JSON with 'feedback' and 'score' fields.""", + name="prometheus_eval", + instruction="Evaluate the response based on the given question, reference answer, and rubric. Provide a 'feedback' and 'score' out of 5 in JSON format.", examples=[ { - "question": "Struggling with a recent break-up, a person opens up about the intense feelings of loneliness and sadness. They ask for advice on how to cope with the heartbreak and move forward in life.", - "answer": """I'm genuinely sorry to hear about your break-up. This can be an immensely challenging time filled with intense emotions of loneliness and sorrow. It's important to remember that it's normal to experience these feelings; they are a testament to the deep connection you once had. Firstly, don't hesitate to embrace your emotions, let them flow freely. Ignoring them often complicates the healing process. It's natural to be upset, to shed tears, or to feel angry. This is all part of the journey towards healing. Self-care can be a healing balm in these tough times. This could mean indulging in your favourite book, journaling your thoughts, or even just a quiet cup of tea. Small acts of kindness to oneself can significantly improve your emotional well-being. Also, don't isolate yourself. Surround yourself with supportive individuals who can offer comfort and understanding. This might include friends, family, or even a mental health professional. 
They can provide much-needed emotional support and strategies to help you cope. Lastly, remember there's no set timeline for healing. It's different for everyone and it's okay to take as long as you need. Keep your focus on the future and the potential it holds. Though it may not seem possible now, you will emerge stronger from this experience. It's okay to ask for help, it's okay to grieve, and it's okay to take it one day at a time. You're not alone in this journey, and there are people ready and willing to help. It's okay to heal at your own pace.""", - "ground_truth": """I can only imagine how difficult this time must be for you. Heartbreak can feel like an overwhelming wave that washes over you, making you feel lost and alone. It's important to remember that it's okay to feel this way; your feelings are valid and it's natural to grieve the loss of a relationship. Firstly, it's crucial to allow yourself to feel the emotions fully. Suppressing or denying them often only prolongs the healing process. It's okay to cry, to feel sad, or to be angry. These are all part of the healing journey. Engaging in self-care activities can also provide a much-needed boost. This could be anything from taking a warm bath, writing in a journal, or simply drinking a cup of your favorite tea. Small acts of self-love can make a big difference in how you feel. Next, try to surround yourself with supportive people who understand your situation and provide comfort. Friends and family can be a great source of strength in times of heartbreak. If you feel comfortable, you might want to consider seeking professional help. Therapists and counselors are trained to provide assistance and tools to navigate through difficult times like these. Lastly, it's important to remember that it's okay to take your time to heal. Everyone has their own pace and there's no rush. Try to focus on the future and the possibilities it holds. While it may not seem like it now, you will come out stronger and more resilient from this experience. Remember, it's okay to ask for help and it's okay to feel the way you feel. You are not alone in this journey and there are people who care about you and want to help. It's okay to take one day at a time. Healing is a process, and it's okay to move through it at your own pace.""", + "question": "Explain the concept of recursion in programming.", + "answer": "Recursion is when a function calls itself.", + "ground_truth": "Recursion is a programming technique where a function calls itself to solve smaller instances of a problem, often with a base case to prevent infinite loops.", "rubrics": { - "criteria": "Is the model proficient in applying empathy and emotional intelligence to its responses when the user conveys emotions or faces challenging circumstances?", - "score1_description": "The model neglects to identify or react to the emotional tone of user inputs, giving responses that are unfitting or emotionally insensitive.", - "score2_description": "The model intermittently acknowledges emotional context but often responds without sufficient empathy or emotional understanding.", - "score3_description": "The model typically identifies emotional context and attempts to answer with empathy, yet the responses might sometimes miss the point or lack emotional profundity.", - "score4_description": "The model consistently identifies and reacts suitably to emotional context, providing empathetic responses. 
Nonetheless, there may still be sporadic oversights or deficiencies in emotional depth.", - "score5_description": "The model excels in identifying emotional context and persistently offers empathetic, emotionally aware responses that demonstrate a profound comprehension of the user's emotions or situation." + "criteria": "Does the answer accurately explain recursion and its key components?", + "score1_description": "Fails to capture the essence of recursion, offering an incomplete or incorrect explanation.", + "score2_description": "Provides a very basic explanation of recursion but misses key components, such as the base case.", + "score3_description": "Correctly explains recursion but lacks depth, particularly in discussing the base case and its importance.", + "score4_description": "Gives a mostly accurate and detailed explanation of recursion, including the base case, with minor omissions.", + "score5_description": "Provides a clear and complete explanation of recursion, including a thorough discussion of the base case and its significance." }, - "analysis": ScoreFeedbackAnswers.parse_obj( - [ - { - "feedback": """The response provided shows a high level of empathy and emotional intelligence. It effectively addresses the emotional distress expressed by the user. It acknowledges the user's pain and validates their feelings of loneliness and sadness, which is a crucial aspect of providing empathetic advice. The response also suggests practical steps for coping, such as embracing emotions, practicing self-care, and seeking support from friends, family, or professionals. Furthermore, the response reassures the user that healing is a personal process with no fixed timeline, offering comfort and understanding. It emphasizes the user's worth and potential to overcome the situation, which demonstrates a profound comprehension of the user's emotions and situation. By comparing the score rubric with the provided response, it is clear that the model exhibits an excellent ability to apply empathy and emotional intelligence. 
The response does not have any deficiencies in emotional depth and successfully meets the criteria for a score of 5.""", - "score": 5, - } - ] - ).dicts(), + "analysis": {"feedback": "The answer is correct but lacks details on the base case, which is crucial for understanding recursion fully.", "score": 3} } ], input_keys=["question", "answer", "ground_truth", "rubrics"], @@ -68,9 +60,10 @@ def dicts(self) -> t.List[t.Dict]: language="english", ) + class Prometheus(MetricWithLLM): name = "prometheus" - evaluation_mode = EvaluationMode.qga # Uses question, ground truth, answer + evaluation_mode = EvaluationMode.qga def __init__( self, From 106c95382fc1f695f9497ac1fbe5e8f2296139a8 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Wed, 14 Aug 2024 11:39:16 +0530 Subject: [PATCH 11/19] Removes the relative grading part and focuses on the absolute grading --- src/ragas/metrics/_prometheus.py | 40 +++++++------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/src/ragas/metrics/_prometheus.py b/src/ragas/metrics/_prometheus.py index d693bf37c..de940baf7 100644 --- a/src/ragas/metrics/_prometheus.py +++ b/src/ragas/metrics/_prometheus.py @@ -1,7 +1,6 @@ from __future__ import annotations import typing as t -from enum import Enum import numpy as np from langchain_core.pydantic_v1 import BaseModel, Field @@ -15,11 +14,6 @@ from ragas.llms.prompt import PromptValue -class PrometheusMode(Enum): - ABSOLUTE = "absolute" - RELATIVE = "relative" - - class ScoreFeedback(BaseModel): feedback: str = Field(..., description="The feedback for the response") score: int = Field(..., description="The score given to the response") @@ -68,46 +62,26 @@ def dicts(self) -> t.List[t.Dict]: language="english", ) -class Prometheus(MetricWithLLM): - name = "prometheus" +class PrometheusAbsolute(MetricWithLLM): + name = "prometheus_absolute" evaluation_mode = EvaluationMode.qga # Uses question, ground truth, answer def __init__( self, - mode: PrometheusMode = PrometheusMode.ABSOLUTE, rubrics: Optional[Dict] = None, llm: Optional[BaseRagasLLM] = None, max_retries: int = 1, ): super().__init__(llm=llm) - self.mode = mode self.rubrics = rubrics self.max_retries = max_retries async def _ascore(self, row: Dict, callbacks: t.Callbacks, is_async: bool = False) -> float: - if self.mode == PrometheusMode.ABSOLUTE: - return await self._absolute_score(row, callbacks) - elif self.mode == PrometheusMode.RELATIVE: - return await self._relative_score(row, callbacks) - else: - raise ValueError(f"Invalid mode: {self.mode}") - - def _create_prompt(self, row: Dict) -> Prompt: - return PROMETHEUS_ABSOLUTE_PROMPT.format( - question=row.get('question', ''), - answer=row.get('answer', ''), - ground_truth=row.get('ground_truth', ''), - rubrics=self.rubrics, - ) - - async def _absolute_score(self, row: Dict, callbacks: t.Callbacks) -> float: prompt_value = self._create_prompt(row) - response = await self.llm.generate(prompt_value, callbacks=callbacks) - parsed_response = await _score_feedback_output_parser.aparse( response.generations[0][0].text, prompt_value, self.llm, self.max_retries ) @@ -118,9 +92,13 @@ async def _absolute_score(self, row: Dict, callbacks: t.Callbacks) -> float: score = parsed_response.dicts()[0]['score'] return score - async def _relative_score(self, row: Dict, callbacks: t.Callbacks) -> float: - # Implement relative scoring logic here, similar to absolute scoring - return 0.5 + def _create_prompt(self, row: Dict) -> Prompt: + return PROMETHEUS_ABSOLUTE_PROMPT.format( + 
question=row.get('question', ''), + answer=row.get('answer', ''), + ground_truth=row.get('ground_truth', ''), + rubrics=self.rubrics, + ) def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: PROMETHEUS_ABSOLUTE_PROMPT.adapt(language, self.llm, cache_dir) From 91335cbc56f4f62f9237dd43787f108dc733bdf1 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Wed, 14 Aug 2024 11:40:09 +0530 Subject: [PATCH 12/19] Removes the relative grading part and focuses on the absolute grading --- src/ragas/metrics/_prometheus.py | 75 +++++++++++++------------------- 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/src/ragas/metrics/_prometheus.py b/src/ragas/metrics/_prometheus.py index 6c012dd4f..de940baf7 100644 --- a/src/ragas/metrics/_prometheus.py +++ b/src/ragas/metrics/_prometheus.py @@ -1,7 +1,6 @@ from __future__ import annotations import typing as t -from enum import Enum import numpy as np from langchain_core.pydantic_v1 import BaseModel, Field @@ -15,11 +14,6 @@ from ragas.llms.prompt import PromptValue -class PrometheusMode(Enum): - ABSOLUTE = "absolute" - RELATIVE = "relative" - - class ScoreFeedback(BaseModel): feedback: str = Field(..., description="The feedback for the response") score: int = Field(..., description="The score given to the response") @@ -37,22 +31,30 @@ def dicts(self) -> t.List[t.Dict]: PROMETHEUS_ABSOLUTE_PROMPT = Prompt( - name="prometheus_eval", - instruction="Evaluate the response based on the given question, reference answer, and rubric. Provide a 'feedback' and 'score' out of 5 in JSON format.", + name="prometheus_score", + output_format_instruction=_score_feedback_output_instructions, + instruction="""Given an instruction, a response, a reference answer, and a score rubric, evaluate the response by providing detailed feedback and assigning a score between 1 and 5. Format the output as JSON with 'feedback' and 'score' fields.""", examples=[ { - "question": "Explain the concept of recursion in programming.", - "answer": "Recursion is when a function calls itself.", - "ground_truth": "Recursion is a programming technique where a function calls itself to solve smaller instances of a problem, often with a base case to prevent infinite loops.", + "question": "Struggling with a recent break-up, a person opens up about the intense feelings of loneliness and sadness. They ask for advice on how to cope with the heartbreak and move forward in life.", + "answer": """I'm genuinely sorry to hear about your break-up. This can be an immensely challenging time filled with intense emotions of loneliness and sorrow. It's important to remember that it's normal to experience these feelings; they are a testament to the deep connection you once had. Firstly, don't hesitate to embrace your emotions, let them flow freely. Ignoring them often complicates the healing process. It's natural to be upset, to shed tears, or to feel angry. This is all part of the journey towards healing. Self-care can be a healing balm in these tough times. This could mean indulging in your favourite book, journaling your thoughts, or even just a quiet cup of tea. Small acts of kindness to oneself can significantly improve your emotional well-being. Also, don't isolate yourself. Surround yourself with supportive individuals who can offer comfort and understanding. This might include friends, family, or even a mental health professional. They can provide much-needed emotional support and strategies to help you cope. Lastly, remember there's no set timeline for healing. 
It's different for everyone and it's okay to take as long as you need. Keep your focus on the future and the potential it holds. Though it may not seem possible now, you will emerge stronger from this experience. It's okay to ask for help, it's okay to grieve, and it's okay to take it one day at a time. You're not alone in this journey, and there are people ready and willing to help. It's okay to heal at your own pace.""", + "ground_truth": """I can only imagine how difficult this time must be for you. Heartbreak can feel like an overwhelming wave that washes over you, making you feel lost and alone. It's important to remember that it's okay to feel this way; your feelings are valid and it's natural to grieve the loss of a relationship. Firstly, it's crucial to allow yourself to feel the emotions fully. Suppressing or denying them often only prolongs the healing process. It's okay to cry, to feel sad, or to be angry. These are all part of the healing journey. Engaging in self-care activities can also provide a much-needed boost. This could be anything from taking a warm bath, writing in a journal, or simply drinking a cup of your favorite tea. Small acts of self-love can make a big difference in how you feel. Next, try to surround yourself with supportive people who understand your situation and provide comfort. Friends and family can be a great source of strength in times of heartbreak. If you feel comfortable, you might want to consider seeking professional help. Therapists and counselors are trained to provide assistance and tools to navigate through difficult times like these. Lastly, it's important to remember that it's okay to take your time to heal. Everyone has their own pace and there's no rush. Try to focus on the future and the possibilities it holds. While it may not seem like it now, you will come out stronger and more resilient from this experience. Remember, it's okay to ask for help and it's okay to feel the way you feel. You are not alone in this journey and there are people who care about you and want to help. It's okay to take one day at a time. Healing is a process, and it's okay to move through it at your own pace.""", "rubrics": { - "criteria": "Does the answer accurately explain recursion and its key components?", - "score1_description": "Fails to capture the essence of recursion, offering an incomplete or incorrect explanation.", - "score2_description": "Provides a very basic explanation of recursion but misses key components, such as the base case.", - "score3_description": "Correctly explains recursion but lacks depth, particularly in discussing the base case and its importance.", - "score4_description": "Gives a mostly accurate and detailed explanation of recursion, including the base case, with minor omissions.", - "score5_description": "Provides a clear and complete explanation of recursion, including a thorough discussion of the base case and its significance." 
+ "criteria": "Is the model proficient in applying empathy and emotional intelligence to its responses when the user conveys emotions or faces challenging circumstances?", + "score1_description": "The model neglects to identify or react to the emotional tone of user inputs, giving responses that are unfitting or emotionally insensitive.", + "score2_description": "The model intermittently acknowledges emotional context but often responds without sufficient empathy or emotional understanding.", + "score3_description": "The model typically identifies emotional context and attempts to answer with empathy, yet the responses might sometimes miss the point or lack emotional profundity.", + "score4_description": "The model consistently identifies and reacts suitably to emotional context, providing empathetic responses. Nonetheless, there may still be sporadic oversights or deficiencies in emotional depth.", + "score5_description": "The model excels in identifying emotional context and persistently offers empathetic, emotionally aware responses that demonstrate a profound comprehension of the user's emotions or situation." }, - "analysis": {"feedback": "The answer is correct but lacks details on the base case, which is crucial for understanding recursion fully.", "score": 3} + "analysis": ScoreFeedbackAnswers.parse_obj( + [ + { + "feedback": """The response provided shows a high level of empathy and emotional intelligence. It effectively addresses the emotional distress expressed by the user. It acknowledges the user's pain and validates their feelings of loneliness and sadness, which is a crucial aspect of providing empathetic advice. The response also suggests practical steps for coping, such as embracing emotions, practicing self-care, and seeking support from friends, family, or professionals. Furthermore, the response reassures the user that healing is a personal process with no fixed timeline, offering comfort and understanding. It emphasizes the user's worth and potential to overcome the situation, which demonstrates a profound comprehension of the user's emotions and situation. By comparing the score rubric with the provided response, it is clear that the model exhibits an excellent ability to apply empathy and emotional intelligence. 
The response does not have any deficiencies in emotional depth and successfully meets the criteria for a score of 5.""", + "score": 5, + } + ] + ).dicts(), } ], input_keys=["question", "answer", "ground_truth", "rubrics"], @@ -60,47 +62,26 @@ def dicts(self) -> t.List[t.Dict]: language="english", ) - -class Prometheus(MetricWithLLM): - name = "prometheus" - evaluation_mode = EvaluationMode.qga +class PrometheusAbsolute(MetricWithLLM): + name = "prometheus_absolute" + evaluation_mode = EvaluationMode.qga # Uses question, ground truth, answer def __init__( self, - mode: PrometheusMode = PrometheusMode.ABSOLUTE, rubrics: Optional[Dict] = None, llm: Optional[BaseRagasLLM] = None, max_retries: int = 1, ): super().__init__(llm=llm) - self.mode = mode self.rubrics = rubrics self.max_retries = max_retries async def _ascore(self, row: Dict, callbacks: t.Callbacks, is_async: bool = False) -> float: - if self.mode == PrometheusMode.ABSOLUTE: - return await self._absolute_score(row, callbacks) - elif self.mode == PrometheusMode.RELATIVE: - return await self._relative_score(row, callbacks) - else: - raise ValueError(f"Invalid mode: {self.mode}") - - def _create_prompt(self, row: Dict) -> Prompt: - return PROMETHEUS_ABSOLUTE_PROMPT.format( - question=row.get('question', ''), - answer=row.get('answer', ''), - ground_truth=row.get('ground_truth', ''), - rubrics=self.rubrics, - ) - - async def _absolute_score(self, row: Dict, callbacks: t.Callbacks) -> float: prompt_value = self._create_prompt(row) - response = await self.llm.generate(prompt_value, callbacks=callbacks) - parsed_response = await _score_feedback_output_parser.aparse( response.generations[0][0].text, prompt_value, self.llm, self.max_retries ) @@ -111,9 +92,13 @@ async def _absolute_score(self, row: Dict, callbacks: t.Callbacks) -> float: score = parsed_response.dicts()[0]['score'] return score - async def _relative_score(self, row: Dict, callbacks: t.Callbacks) -> float: - # Implement relative scoring logic here, similar to absolute scoring - return 0.5 + def _create_prompt(self, row: Dict) -> Prompt: + return PROMETHEUS_ABSOLUTE_PROMPT.format( + question=row.get('question', ''), + answer=row.get('answer', ''), + ground_truth=row.get('ground_truth', ''), + rubrics=self.rubrics, + ) def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: PROMETHEUS_ABSOLUTE_PROMPT.adapt(language, self.llm, cache_dir) From 7fe2b17054fae1d1625b86af9b6e53276c1e7312 Mon Sep 17 00:00:00 2001 From: Vaishakh Raveendran Date: Wed, 14 Aug 2024 11:41:20 +0530 Subject: [PATCH 13/19] update the _init_.py to add PrometheusAbsolute --- src/ragas/metrics/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index f5f587fac..e779f52e5 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -11,7 +11,7 @@ context_precision, context_utilization, ) -from ragas.metrics._prometheus import Prometheus +from ragas.metrics._prometheus import PrometheusAbsolute from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, faithfulness from ragas.metrics._summarization import SummarizationScore, summarization_score @@ -37,5 +37,5 @@ "context_entity_recall", "SummarizationScore", "summarization_score", - "Prometheus", + "PrometheusAbsolute", ] From 4056858942c5d7e40ac254836ab42a41ce93469e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 12:52:54 +0530 Subject: [PATCH 14/19] 
remove-rename and bug fixes --- src/ragas/metrics/_prometheus.py | 107 ------------------------- src/ragas/metrics/_rubrics_based.py | 120 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 107 deletions(-) delete mode 100644 src/ragas/metrics/_prometheus.py create mode 100644 src/ragas/metrics/_rubrics_based.py diff --git a/src/ragas/metrics/_prometheus.py b/src/ragas/metrics/_prometheus.py deleted file mode 100644 index de940baf7..000000000 --- a/src/ragas/metrics/_prometheus.py +++ /dev/null @@ -1,107 +0,0 @@ -from __future__ import annotations - -import typing as t -import numpy as np -from langchain_core.pydantic_v1 import BaseModel, Field - -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions -from ragas.llms.prompt import Prompt -from ragas.metrics.base import EvaluationMode, MetricWithLLM - -if t.TYPE_CHECKING: - from langchain_core.callbacks import Callbacks - - from ragas.llms.prompt import PromptValue - - -class ScoreFeedback(BaseModel): - feedback: str = Field(..., description="The feedback for the response") - score: int = Field(..., description="The score given to the response") - - -class ScoreFeedbackAnswers(BaseModel): - __root__: t.List[ScoreFeedback] - - def dicts(self) -> t.List[t.Dict]: - return self.dict()["__root__"] - - -_score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers) -_score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers) - - -PROMETHEUS_ABSOLUTE_PROMPT = Prompt( - name="prometheus_score", - output_format_instruction=_score_feedback_output_instructions, - instruction="""Given an instruction, a response, a reference answer, and a score rubric, evaluate the response by providing detailed feedback and assigning a score between 1 and 5. Format the output as JSON with 'feedback' and 'score' fields.""", - examples=[ - { - "question": "Struggling with a recent break-up, a person opens up about the intense feelings of loneliness and sadness. They ask for advice on how to cope with the heartbreak and move forward in life.", - "answer": """I'm genuinely sorry to hear about your break-up. This can be an immensely challenging time filled with intense emotions of loneliness and sorrow. It's important to remember that it's normal to experience these feelings; they are a testament to the deep connection you once had. Firstly, don't hesitate to embrace your emotions, let them flow freely. Ignoring them often complicates the healing process. It's natural to be upset, to shed tears, or to feel angry. This is all part of the journey towards healing. Self-care can be a healing balm in these tough times. This could mean indulging in your favourite book, journaling your thoughts, or even just a quiet cup of tea. Small acts of kindness to oneself can significantly improve your emotional well-being. Also, don't isolate yourself. Surround yourself with supportive individuals who can offer comfort and understanding. This might include friends, family, or even a mental health professional. They can provide much-needed emotional support and strategies to help you cope. Lastly, remember there's no set timeline for healing. It's different for everyone and it's okay to take as long as you need. Keep your focus on the future and the potential it holds. Though it may not seem possible now, you will emerge stronger from this experience. It's okay to ask for help, it's okay to grieve, and it's okay to take it one day at a time. 
You're not alone in this journey, and there are people ready and willing to help. It's okay to heal at your own pace.""", - "ground_truth": """I can only imagine how difficult this time must be for you. Heartbreak can feel like an overwhelming wave that washes over you, making you feel lost and alone. It's important to remember that it's okay to feel this way; your feelings are valid and it's natural to grieve the loss of a relationship. Firstly, it's crucial to allow yourself to feel the emotions fully. Suppressing or denying them often only prolongs the healing process. It's okay to cry, to feel sad, or to be angry. These are all part of the healing journey. Engaging in self-care activities can also provide a much-needed boost. This could be anything from taking a warm bath, writing in a journal, or simply drinking a cup of your favorite tea. Small acts of self-love can make a big difference in how you feel. Next, try to surround yourself with supportive people who understand your situation and provide comfort. Friends and family can be a great source of strength in times of heartbreak. If you feel comfortable, you might want to consider seeking professional help. Therapists and counselors are trained to provide assistance and tools to navigate through difficult times like these. Lastly, it's important to remember that it's okay to take your time to heal. Everyone has their own pace and there's no rush. Try to focus on the future and the possibilities it holds. While it may not seem like it now, you will come out stronger and more resilient from this experience. Remember, it's okay to ask for help and it's okay to feel the way you feel. You are not alone in this journey and there are people who care about you and want to help. It's okay to take one day at a time. Healing is a process, and it's okay to move through it at your own pace.""", - "rubrics": { - "criteria": "Is the model proficient in applying empathy and emotional intelligence to its responses when the user conveys emotions or faces challenging circumstances?", - "score1_description": "The model neglects to identify or react to the emotional tone of user inputs, giving responses that are unfitting or emotionally insensitive.", - "score2_description": "The model intermittently acknowledges emotional context but often responds without sufficient empathy or emotional understanding.", - "score3_description": "The model typically identifies emotional context and attempts to answer with empathy, yet the responses might sometimes miss the point or lack emotional profundity.", - "score4_description": "The model consistently identifies and reacts suitably to emotional context, providing empathetic responses. Nonetheless, there may still be sporadic oversights or deficiencies in emotional depth.", - "score5_description": "The model excels in identifying emotional context and persistently offers empathetic, emotionally aware responses that demonstrate a profound comprehension of the user's emotions or situation." - }, - "analysis": ScoreFeedbackAnswers.parse_obj( - [ - { - "feedback": """The response provided shows a high level of empathy and emotional intelligence. It effectively addresses the emotional distress expressed by the user. It acknowledges the user's pain and validates their feelings of loneliness and sadness, which is a crucial aspect of providing empathetic advice. 
The response also suggests practical steps for coping, such as embracing emotions, practicing self-care, and seeking support from friends, family, or professionals. Furthermore, the response reassures the user that healing is a personal process with no fixed timeline, offering comfort and understanding. It emphasizes the user's worth and potential to overcome the situation, which demonstrates a profound comprehension of the user's emotions and situation. By comparing the score rubric with the provided response, it is clear that the model exhibits an excellent ability to apply empathy and emotional intelligence. The response does not have any deficiencies in emotional depth and successfully meets the criteria for a score of 5.""", - "score": 5, - } - ] - ).dicts(), - } - ], - input_keys=["question", "answer", "ground_truth", "rubrics"], - output_key="analysis", - language="english", -) - -class PrometheusAbsolute(MetricWithLLM): - name = "prometheus_absolute" - evaluation_mode = EvaluationMode.qga # Uses question, ground truth, answer - - def __init__( - self, - rubrics: Optional[Dict] = None, - llm: Optional[BaseRagasLLM] = None, - max_retries: int = 1, - ): - super().__init__(llm=llm) - self.rubrics = rubrics - self.max_retries = max_retries - - - async def _ascore(self, row: Dict, callbacks: t.Callbacks, is_async: bool = False) -> float: - prompt_value = self._create_prompt(row) - - response = await self.llm.generate(prompt_value, callbacks=callbacks) - - parsed_response = await _score_feedback_output_parser.aparse( - response.generations[0][0].text, prompt_value, self.llm, self.max_retries - ) - - if parsed_response is None: - return np.nan - - score = parsed_response.dicts()[0]['score'] - return score - - def _create_prompt(self, row: Dict) -> Prompt: - return PROMETHEUS_ABSOLUTE_PROMPT.format( - question=row.get('question', ''), - answer=row.get('answer', ''), - ground_truth=row.get('ground_truth', ''), - rubrics=self.rubrics, - ) - - def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - PROMETHEUS_ABSOLUTE_PROMPT.adapt(language, self.llm, cache_dir) - - def save(self, cache_dir: t.Optional[str] = None) -> None: - PROMETHEUS_ABSOLUTE_PROMPT.save(cache_dir) diff --git a/src/ragas/metrics/_rubrics_based.py b/src/ragas/metrics/_rubrics_based.py new file mode 100644 index 000000000..810d96b82 --- /dev/null +++ b/src/ragas/metrics/_rubrics_based.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import typing as t +from dataclasses import dataclass, field + +import numpy as np +from langchain_core.pydantic_v1 import BaseModel, Field + +from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.prompt import Prompt +from ragas.metrics.base import EvaluationMode, MetricWithLLM + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.prompt import PromptValue + + +class ScoreFeedback(BaseModel): + feedback: str = Field(..., description="The feedback for the response") + score: int = Field(..., description="The score given to the response") + + +class ScoreFeedbackAnswers(BaseModel): + __root__: t.List[ScoreFeedback] + + def dicts(self) -> t.List[t.Dict]: + return self.dict()["__root__"] + + +_score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers) +_score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers) + +DEFAULT_RUBRICS = { + "criteria": "Is the response factually accurate and does it directly answer the question?", + 
"score1_description": "The response is incorrect or does not answer the question.", + "score2_description": "The response is partially correct but may include errors or incomplete information.", + "score3_description": "The response is generally correct but lacks clarity or completeness.", + "score4_description": "The response is correct and clear, with minor issues or missing details.", + "score5_description": "The response is completely accurate, clear, and answers the question directly.", +} + +SCORING_PROMPT = Prompt( + name="prometheus_score", + output_format_instruction=_score_feedback_output_instructions, + instruction="""An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing evaluation criteria are given. +1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, without evaluating in general. +2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""", + examples=[ + { + "question": "What is the capital of France?", + "answer": "The capital of France is Paris.", + "ground_truth": "The capital of France is Paris.", + "rubrics": DEFAULT_RUBRICS, + "analysis": ScoreFeedbackAnswers.parse_obj( + [ + { + "feedback": """The response is completely accurate and directly answers the question about the capital of France. It matches the reference answer perfectly and does not contain any errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""", + "score": 5, + } + ] + ).dicts(), + } + ], + input_keys=["question", "answer", "ground_truth", "rubrics"], + output_key="analysis", + language="english", +) + + +@dataclass +class RubricsBasedScore(MetricWithLLM): + name: str = "absolute_rubrics_score" # type: ignore + evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore + rubrics: t.Optional[t.Dict[str, str]] = None + scoring_prompt: Prompt = field(default_factory=lambda: SCORING_PROMPT) + max_retries: int = 1 + + def __post_init__(self): + self.rubrics = DEFAULT_RUBRICS if self.rubrics is None else self.rubrics + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + + prompt_value = self._create_prompt(row) + + response = await self.llm.generate(prompt_value, callbacks=callbacks) + + parsed_response = await _score_feedback_output_parser.aparse( + response.generations[0][0].text, prompt_value, self.llm, self.max_retries + ) + + if parsed_response is None: + return np.nan + + score = parsed_response.dicts()[0]["score"] + return score + + def _create_prompt(self, row: t.Dict) -> PromptValue: + question, answer, ground_truth = ( + row["question"], + row["answer"], + row["ground_truth"], + ) + return self.scoring_prompt.format( + question=question, + answer=answer, + ground_truth=ground_truth, + rubrics=self.rubrics, + ) + + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + assert self.llm is not None, "LLM must be set to adapt the metric" + self.scoring_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: t.Optional[str] = None) -> None: + self.scoring_prompt.save(cache_dir) + + +absolute_rubrics_score = RubricsBasedScore() From 64c70e009b481a616017ca7e7904cc99966dd431 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 12:53:13 +0530 Subject: [PATCH 15/19] add rubrics to init --- 
src/ragas/metrics/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index e779f52e5..8ea72ff12 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -11,9 +11,9 @@ context_precision, context_utilization, ) -from ragas.metrics._prometheus import PrometheusAbsolute from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, faithfulness +from ragas.metrics._rubrics_based import RubricsBasedScore, absolute_rubrics_score from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics.critique import AspectCritique @@ -37,5 +37,6 @@ "context_entity_recall", "SummarizationScore", "summarization_score", - "PrometheusAbsolute", + "RubricsBasedScore", + "absolute_rubrics_score", ] From 27d194d2ecbcf9f78283609ff95ea27454c6b8f9 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 12:55:02 +0530 Subject: [PATCH 16/19] remove format changes --- .../ragas_experimental/testset/extractors/regex_based.py | 2 +- src/experimental/ragas_experimental/testset/generators/base.py | 2 +- .../ragas_experimental/testset/generators/simple.py | 2 +- src/experimental/ragas_experimental/testset/questions/base.py | 3 ++- .../ragas_experimental/testset/splitters/section_splitter.py | 1 + 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/extractors/regex_based.py b/src/experimental/ragas_experimental/testset/extractors/regex_based.py index e69dad299..bea66a5de 100644 --- a/src/experimental/ragas_experimental/testset/extractors/regex_based.py +++ b/src/experimental/ragas_experimental/testset/extractors/regex_based.py @@ -111,4 +111,4 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]: markdown_headings = RulebasedExtractor( regex=Regex(name="markdown_headings", pattern=markdown_headings_pattern), is_multiline=True, -) \ No newline at end of file +) diff --git a/src/experimental/ragas_experimental/testset/generators/base.py b/src/experimental/ragas_experimental/testset/generators/base.py index 64299fb42..ce40adc38 100644 --- a/src/experimental/ragas_experimental/testset/generators/base.py +++ b/src/experimental/ragas_experimental/testset/generators/base.py @@ -66,4 +66,4 @@ def generate_with_llamaindex_docs( distribution: QADistribution, ) -> TestDataset: docs = [doc.to_langchain_format() for doc in docs] - return self.generate(docs, test_size, distribution) \ No newline at end of file + return self.generate(docs, test_size, distribution) diff --git a/src/experimental/ragas_experimental/testset/generators/simple.py b/src/experimental/ragas_experimental/testset/generators/simple.py index 6609df06e..4d845a675 100644 --- a/src/experimental/ragas_experimental/testset/generators/simple.py +++ b/src/experimental/ragas_experimental/testset/generators/simple.py @@ -184,4 +184,4 @@ def generate( is_experiment=True, ) ) - return results \ No newline at end of file + return results diff --git a/src/experimental/ragas_experimental/testset/questions/base.py b/src/experimental/ragas_experimental/testset/questions/base.py index a129d20ce..1ab7c9c38 100644 --- a/src/experimental/ragas_experimental/testset/questions/base.py +++ b/src/experimental/ragas_experimental/testset/questions/base.py @@ -42,7 +42,8 @@ class QAC: name: t.Optional[str] = None style: t.Optional[QuestionStyle] = QuestionStyle.PERFECT_GRAMMAR length: 
t.Optional[QuestionLength] = QuestionLength.MEDIUM - + + @dataclass class StyleLengthDistribution: style_length_distribution: t.Dict[ diff --git a/src/experimental/ragas_experimental/testset/splitters/section_splitter.py b/src/experimental/ragas_experimental/testset/splitters/section_splitter.py index 368d8364c..f22328d64 100644 --- a/src/experimental/ragas_experimental/testset/splitters/section_splitter.py +++ b/src/experimental/ragas_experimental/testset/splitters/section_splitter.py @@ -1,5 +1,6 @@ import re import typing as t + import numpy as np from langchain_core.documents import Document as LCDocument from ragas_experimental.testset.graph import Node, NodeLevel, NodeType, Relationship From 98b2a5615447871017c7b9d80f9c69de9b59ef10 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 19:01:09 +0530 Subject: [PATCH 17/19] change metric to domain specific eval --- src/ragas/metrics/_rubrics_based.py | 100 ++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 13 deletions(-) diff --git a/src/ragas/metrics/_rubrics_based.py b/src/ragas/metrics/_rubrics_based.py index 810d96b82..fe50ab9a9 100644 --- a/src/ragas/metrics/_rubrics_based.py +++ b/src/ragas/metrics/_rubrics_based.py @@ -31,8 +31,7 @@ def dicts(self) -> t.List[t.Dict]: _score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers) _score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers) -DEFAULT_RUBRICS = { - "criteria": "Is the response factually accurate and does it directly answer the question?", +DEFAULT_REFERENCE_FREE_RUBRICS = { "score1_description": "The response is incorrect or does not answer the question.", "score2_description": "The response is partially correct but may include errors or incomplete information.", "score3_description": "The response is generally correct but lacks clarity or completeness.", "score4_description": "The response is correct and clear, with minor issues or missing details.", "score5_description": "The response is completely accurate, clear, and answers the question directly.", } + +DEFAULT_WITH_REFERENCE_RUBRICS = { + "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.", + "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.", + "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.", + "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.", + "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.", +} + + +WITH_REFERENCE_SCORING_PROMPT = Prompt( + name="prometheus_score", + output_format_instruction=_score_feedback_output_instructions, + instruction="""A question (which might contain an input along with it), an answer to evaluate, a ground_truth answer that gets a score of 5, and a score rubric representing evaluation criteria are given. +1.
Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, without evaluating in general. 2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""", examples=[ { "question": "What is the capital of France?", "answer": "The capital of France is Paris.", "ground_truth": "The capital of France is Paris.", - "rubrics": DEFAULT_RUBRICS, + "rubrics": DEFAULT_WITH_REFERENCE_RUBRICS, "analysis": ScoreFeedbackAnswers.parse_obj( [ { "feedback": """The response is completely accurate and directly answers the question about the capital of France. It matches the reference answer perfectly and does not contain any errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""", "score": 5, } ] ).dicts(), } ], input_keys=["question", "answer", "ground_truth", "rubrics"], output_key="analysis", language="english", ) +WITHOUT_REFERENCE_SCORING_PROMPT = Prompt( + name="prometheus_score", + output_format_instruction=_score_feedback_output_instructions, + instruction="""A question (which might contain an input along with it), an answer to evaluate, and a score rubric representing evaluation criteria are given. +1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, without evaluating in general. +2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""", + examples=[ + { + "question": "What is the capital of France?", + "answer": "The capital of France is Paris.", + "rubrics": DEFAULT_REFERENCE_FREE_RUBRICS, + "analysis": ScoreFeedbackAnswers.parse_obj( + [ + { + "feedback": """The response is completely accurate and directly answers the question about the capital of France. It is clear, contains no errors or omissions, and needs no additional detail. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""", + "score": 5, + } + ] + ).dicts(), + } + ], + input_keys=["question", "answer", "rubrics"], + output_key="analysis", + language="english", +) + + @dataclass -class RubricsBasedScore(MetricWithLLM): - name: str = "absolute_rubrics_score" # type: ignore - evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore +class LabelledRubricsScore(MetricWithLLM): + name: str = "labelled_rubrics_score" # type: ignore + evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore rubrics: t.Optional[t.Dict[str, str]] = None - scoring_prompt: Prompt = field(default_factory=lambda: SCORING_PROMPT) + scoring_prompt: Prompt = field( + default_factory=lambda: WITH_REFERENCE_SCORING_PROMPT + ) max_retries: int = 1 def __post_init__(self): - self.rubrics = DEFAULT_RUBRICS if self.rubrics is None else self.rubrics + self.rubrics = ( + DEFAULT_WITH_REFERENCE_RUBRICS if self.rubrics is None else self.rubrics + ) async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" @@ -97,11 +137,14 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: return score def _create_prompt(self, row: t.Dict) -> PromptValue: - question, answer, ground_truth = ( + question, contexts, answer, ground_truth = ( row["question"], + row["contexts"], row["answer"], row["ground_truth"], ) + contexts = "\n".join(contexts) + question = f"{question} answer using context: {contexts}" return self.scoring_prompt.format( question=question, answer=answer, @@ -117,4 +160,35 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: self.scoring_prompt.save(cache_dir) -absolute_rubrics_score = RubricsBasedScore() +@dataclass +class ReferenceFreeRubricsScore(LabelledRubricsScore): + name: str = "reference_free_rubrics_score" # type: ignore + evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore + rubrics: t.Optional[t.Dict[str,
str]] = None + scoring_prompt: Prompt = field( + default_factory=lambda: WITHOUT_REFERENCE_SCORING_PROMPT + ) + max_retries: int = 1 + + def __post_init__(self): + self.rubrics = ( + DEFAULT_REFERENCE_FREE_RUBRICS if self.rubrics is None else self.rubrics + ) + + def _create_prompt(self, row: t.Dict) -> PromptValue: + question, contexts, answer = ( + row["question"], + row["contexts"], + row["answer"], + ) + contexts = "\n".join(contexts) + question = f"{question} answer using context: {contexts}" + return self.scoring_prompt.format( + question=question, + answer=answer, + rubrics=self.rubrics, + ) + + +labelled_rubrics_score = LabelledRubricsScore() +reference_free_rubrics_score = ReferenceFreeRubricsScore() From 81b5f89b3ab092b152d493dd5e05ba4f9a299d18 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 19:01:20 +0530 Subject: [PATCH 18/19] add required imports --- src/ragas/metrics/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 0f3621991..7236d95e8 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -13,7 +13,12 @@ ) from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness -from ragas.metrics._rubrics_based import RubricsBasedScore, absolute_rubrics_score +from ragas.metrics._rubrics_based import ( + LabelledRubricsScore, + ReferenceFreeRubricsScore, + labelled_rubrics_score, + reference_free_rubrics_score, +) from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics.critique import AspectCritique @@ -38,6 +43,8 @@ "context_entity_recall", "SummarizationScore", "summarization_score", - "RubricsBasedScore", - "absolute_rubrics_score", + "labelled_rubrics_score", + "reference_free_rubrics_score", + "ReferenceFreeRubricsScore", + "LabelledRubricsScore", ] From 5114296e5c2084d43633e59378e7b2e9c1be948a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 22:35:29 +0530 Subject: [PATCH 19/19] add default rubrics --- src/ragas/metrics/_rubrics_based.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/ragas/metrics/_rubrics_based.py b/src/ragas/metrics/_rubrics_based.py index fe50ab9a9..e4dfdfee5 100644 --- a/src/ragas/metrics/_rubrics_based.py +++ b/src/ragas/metrics/_rubrics_based.py @@ -108,17 +108,14 @@ def dicts(self) -> t.List[t.Dict]: class LabelledRubricsScore(MetricWithLLM): name: str = "labelled_rubrics_score" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore - rubrics: t.Optional[t.Dict[str, str]] = None + rubrics: t.Dict[str, str] = field( + default_factory=lambda: DEFAULT_WITH_REFERENCE_RUBRICS + ) scoring_prompt: Prompt = field( default_factory=lambda: WITH_REFERENCE_SCORING_PROMPT ) max_retries: int = 1 - def __post_init__(self): - self.rubrics = ( - DEFAULT_WITH_REFERENCE_RUBRICS if self.rubrics is None else self.rubrics - ) - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" @@ -164,17 +161,14 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: class ReferenceFreeRubricsScore(LabelledRubricsScore): name: str = "reference_free_rubrics_score" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore - rubrics: t.Optional[t.Dict[str, str]] = None + rubrics: t.Dict[str, str] = field( + 
default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS + ) scoring_prompt: Prompt = field( default_factory=lambda: WITHOUT_REFERENCE_SCORING_PROMPT ) max_retries: int = 1 - def __post_init__(self): - self.rubrics = ( - DEFAULT_REFERENCE_FREE_RUBRICS if self.rubrics is None else self.rubrics - ) - def _create_prompt(self, row: t.Dict) -> PromptValue: question, contexts, answer = ( row["question"],