diff --git a/src/ragas/llms/langchain.py b/src/ragas/llms/langchain.py
index a79ce798e..f6e88de5a 100644
--- a/src/ragas/llms/langchain.py
+++ b/src/ragas/llms/langchain.py
@@ -25,6 +25,7 @@ def isOpenAI(llm: BaseLLM | BaseChatModel) -> bool:
 def isBedrock(llm: BaseLLM | BaseChatModel) -> bool:
     return isinstance(llm, Bedrock) or isinstance(llm, BedrockChat)
 
+
 def isAmazonAPIGateway(llm: BaseLLM | BaseChatModel) -> bool:
     return isinstance(llm, AmazonAPIGateway)
 
diff --git a/src/ragas/llms/llamaindex.py b/src/ragas/llms/llamaindex.py
index 63b01987a..d93afacfd 100644
--- a/src/ragas/llms/llamaindex.py
+++ b/src/ragas/llms/llamaindex.py
@@ -10,6 +10,7 @@ if t.TYPE_CHECKING:
     from langchain.callbacks.base import Callbacks
     from langchain.prompts import ChatPromptTemplate
 
+
 try:
     from llama_index.llms.base import LLM as LiLLM
 except ImportError:
diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py
index b8b96b59c..531f1dfcf 100644
--- a/src/ragas/metrics/_answer_relevance.py
+++ b/src/ragas/metrics/_answer_relevance.py
@@ -12,6 +12,7 @@
 from ragas.embeddings.base import embedding_factory
 from ragas.exceptions import OpenAIKeyNotFound
 from ragas.metrics.base import EvaluationMode, MetricWithLLM
+from ragas.utils import load_as_json
 
 if t.TYPE_CHECKING:
     from langchain.callbacks.manager import CallbackManager
@@ -21,13 +22,46 @@
 QUESTION_GEN = HumanMessagePromptTemplate.from_template(
     """
-Generate question for the given answer.
-Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India
-Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?
+Generate a question for the given answer and identify if the answer is noncommittal.
 
-Answer:{answer}
-Question:
-"""  # noqa: E501
+Answer:
+Albert Einstein was born in Germany.
+Context:
+Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time
+Output:
+{{"question":"Where was Albert Einstein born?","noncommittal":false}}
+
+
+Answer:
+It can change its skin color based on the temperature of its environment.
+Context:
+A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.
+Output:
+{{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}}
+
+
+Answer:
+Everest
+Context:
+The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.
+Output:
+{{"question":"What is the tallest mountain on Earth?","noncommittal":false}}
+
+
+Answer:
+I don't know about the groundbreaking feature of the smartphone invented in 2023, as I am unaware of information beyond 2022.
+Context:
+In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.
+Output:
+{{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}}
+
+
+
+Answer:
+{answer}
+Context:
+{context}
+Output:"""  # noqa: E501
 )
 
 
@@ -53,7 +87,7 @@ class AnswerRelevancy(MetricWithLLM):
     """
 
     name: str = "answer_relevancy"
-    evaluation_mode: EvaluationMode = EvaluationMode.qa
+    evaluation_mode: EvaluationMode = EvaluationMode.qac
     batch_size: int = 15
     strictness: int = 3
     embeddings: RagasEmbeddings = field(default_factory=embedding_factory)
@@ -71,13 +105,17 @@ def _score_batch(
         callbacks: t.Optional[CallbackManager] = None,
         callback_group_name: str = "batch",
     ) -> list[float]:
-        questions, answers = dataset["question"], dataset["answer"]
+        questions, answers, contexts = (
+            dataset["question"],
+            dataset["answer"],
+            dataset["contexts"],
+        )
         with trace_as_chain_group(
             callback_group_name, callback_manager=callbacks
         ) as batch_group:
             prompts = []
-            for ans in answers:
-                human_prompt = QUESTION_GEN.format(answer=ans)
+            for ans, ctx in zip(answers, contexts):
+                human_prompt = QUESTION_GEN.format(answer=ans, context="\n".join(ctx))
                 prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
 
             results = self.llm.generate(
@@ -85,15 +123,13 @@ def _score_batch(
                 n=self.strictness,
                 callbacks=batch_group,
             )
-            results = [[i.text for i in r] for r in results.generations]
-
+            results = [[load_as_json(i.text) for i in r] for r in results.generations]
             scores = []
-            for question, gen_questions in zip(questions, results):
-                if question is not None and question != "" and len(gen_questions) > 0:
-                    cosine_sim = self.calculate_similarity(question, gen_questions)
-                    scores.append(cosine_sim.mean())
-                else:
-                    scores.append(0.0)
+            for question, result in zip(questions, results):
+                gen_questions = [item.get("question", "") for item in result]
+                committal = np.any([item.get("noncommittal", False) for item in result])
+                cosine_sim = self.calculate_similarity(question, gen_questions)
+                scores.append(cosine_sim.mean() * int(not committal))
 
         return scores
 
diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py
index c12865a90..2271ee351 100644
--- a/src/ragas/testset/testset_generator.py
+++ b/src/ragas/testset/testset_generator.py
@@ -58,7 +58,16 @@
     "conditional": "_condition_question",
 }
 
-DataRow = namedtuple("DataRow", ["question", "ground_truth_context", "ground_truth", "question_type", "episode_done"])
+DataRow = namedtuple(
+    "DataRow",
+    [
+        "question",
+        "ground_truth_context",
+        "ground_truth",
+        "question_type",
+        "episode_done",
+    ],
+)
 
 
 @dataclass
@@ -73,11 +82,11 @@ def to_pandas(self) -> pd.DataFrame:
         data_samples = []
         for data in self.test_data:
             data = {
-                    "question": data.question,
-                    "ground_truth_context": data.ground_truth_context,
-                    "ground_truth": data.ground_truth,
-                    "question_type": data.question_type,
-                    "episode_done": data.episode_done,
+                "question": data.question,
+                "ground_truth_context": data.ground_truth_context,
+                "ground_truth": data.ground_truth,
+                "question_type": data.question_type,
+                "episode_done": data.episode_done,
             }
             data_samples.append(data)
 
@@ -394,11 +403,13 @@ def generate(
                 context = self._generate_context(question, text_chunk)
                 is_conv = len(context) > 1
                 answer = self._generate_answer(question, context)
-                for i, (qstn, ctx, ans) in enumerate(zip(question.split("\n"), context, answer)):
-                    episode_done = False if is_conv and i==0 else True
+                for i, (qstn, ctx, ans) in enumerate(
+                    zip(question.split("\n"), context, answer)
+                ):
+                    episode_done = False if is_conv and i == 0 else True
                     samples.append(
-                            DataRow(qstn, [ctx], [ans], evolve_type, episode_done)
-                        )
+                        DataRow(qstn, [ctx], [ans], evolve_type, episode_done)
+                    )
                     count += 1
                     pbar.update(count)
 
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index d82c3dc15..bf0b3e4c2 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -9,11 +9,9 @@ def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
     """
     Remap the column names in case dataset uses different column names
     """
-
+
     inverse_column_map = {v: k for k, v in column_map.items()}
-    return dataset.rename_columns(
-        inverse_column_map
-    )
+    return dataset.rename_columns(inverse_column_map)
 
 
 def validate_column_dtypes(ds: Dataset):
diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py
index 93bfcfcb6..1164d1d39 100644
--- a/tests/unit/test_validation.py
+++ b/tests/unit/test_validation.py
@@ -103,9 +103,7 @@ def test_column_remap(column_map):
         }
     )
     remapped_dataset = remap_column_names(TEST_DATASET, column_map)
-    assert all(
-        col in remapped_dataset.column_names for col in column_map.keys()
-    )
+    assert all(col in remapped_dataset.column_names for col in column_map.keys())
 
 
 def test_column_remap_omit():
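Note on the `_score_batch` change above: each of the `strictness` generations is now parsed as JSON via `load_as_json`, and the row score is the mean cosine similarity between the original question and the regenerated questions, zeroed out when any generation flags the answer as noncommittal. A minimal sketch of that scoring step, with made-up questions and similarity values standing in for `self.calculate_similarity` (which embeds the questions and returns cosine similarities):

```python
import numpy as np

# One row's parsed generations (strictness=3); questions and flags are illustrative.
result = [
    {"question": "When is the PSLV-C56 launch scheduled?", "noncommittal": False},
    {"question": "What is the launch date of the PSLV-C56 mission?", "noncommittal": False},
    {"question": "When will PSLV-C56 lift off?", "noncommittal": False},
]

gen_questions = [item.get("question", "") for item in result]
# If any generation marks the answer as noncommittal, the whole row scores 0.
committal = np.any([item.get("noncommittal", False) for item in result])

# Stand-in for self.calculate_similarity(question, gen_questions).
cosine_sim = np.array([0.91, 0.87, 0.94])

score = cosine_sim.mean() * int(not committal)
print(round(float(score), 3))  # 0.907
```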
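Since `AnswerRelevancy` now uses `EvaluationMode.qac`, evaluation datasets need a `contexts` column alongside `question` and `answer`. A minimal usage sketch, assuming the exported `answer_relevancy` instance and the top-level `evaluate` helper; the row content and printed score are illustrative, and the default LLM and embeddings still require an OpenAI key:

```python
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_relevancy

# Single-row dataset; "contexts" (a list of strings per row) is now required.
ds = Dataset.from_dict(
    {
        "question": ["Where was Albert Einstein born?"],
        "answer": ["Albert Einstein was born in Germany."],
        "contexts": [["Albert Einstein was a German-born theoretical physicist."]],
    }
)

result = evaluate(ds, metrics=[answer_relevancy])
print(result)  # e.g. {'answer_relevancy': 0.98}
```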