From c5b586e7fe0e4e49e6e8950216005aa7db2a9974 Mon Sep 17 00:00:00 2001 From: Shahul ES Date: Wed, 15 Nov 2023 00:23:19 +0530 Subject: [PATCH 1/2] fix: set temperature to near zero value (#267) fixes: #264 --- src/ragas/llms/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 97fb546af..9cd094b76 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -122,11 +122,11 @@ def generate( self, prompts: list[ChatPromptTemplate], n: int = 1, - temperature: float = 0, + temperature: float = 1e-8, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: # set temperature to 0.2 for multiple completions - temperature = 0.2 if n > 1 else 0 + temperature = 0.2 if n > 1 else 1e-8 if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): self.llm.model_kwargs = {"temperature": temperature} else: From 28020193687c13b38e9d63b62155ad6bfd92a0be Mon Sep 17 00:00:00 2001 From: Jithin James Date: Wed, 15 Nov 2023 15:47:38 +0530 Subject: [PATCH 2/2] feat: add native support for OpenAI and AzureOpenAI (#261) - Add support for OpenAI and AzureOpenAI both embeddings and LLMs - rework `RagasLLM` with an async version of generate - checks for API keys and tests that ensure they work --- Makefile | 2 +- docs/howtos/customisations/azure-openai.ipynb | 2 +- docs/howtos/integrations/langfuse.ipynb | 117 ++++++---- pyproject.toml | 3 +- src/ragas/async_utils.py | 67 +++--- src/ragas/embeddings/__init__.py | 14 +- src/ragas/embeddings/base.py | 77 ++++++- src/ragas/evaluation.py | 2 + src/ragas/exceptions.py | 7 + src/ragas/llms/__init__.py | 10 +- src/ragas/llms/base.py | 145 ++---------- src/ragas/llms/langchain.py | 217 ++++++++++++++++++ src/ragas/llms/llamaindex.py | 4 +- src/ragas/llms/openai.py | 143 ++++++++++++ src/ragas/metrics/__init__.py | 4 +- src/ragas/metrics/_answer_correctness.py | 2 +- src/ragas/metrics/_answer_relevance.py | 2 - src/ragas/metrics/_answer_similarity.py | 4 +- src/ragas/metrics/_context_precision.py | 5 +- src/ragas/metrics/_context_relevancy.py | 1 + src/ragas/metrics/base.py | 23 +- src/ragas/metrics/critique.py | 9 +- src/ragas/testset/testset_generator.py | 15 +- src/ragas/utils.py | 2 + tests/benchmarks/benchmark_eval.py | 15 +- tests/unit/test_embeddings.py | 1 + tests/unit/test_import.py | 35 +-- tests/unit/test_llm.py | 138 +++++++++++ tests/unit/test_simple.py | 7 - 29 files changed, 785 insertions(+), 288 deletions(-) create mode 100644 src/ragas/llms/langchain.py create mode 100644 src/ragas/llms/openai.py create mode 100644 tests/unit/test_embeddings.py create mode 100644 tests/unit/test_llm.py diff --git a/Makefile b/Makefile index a0d426d97..bb2db45b3 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ lint: ## Running lint checker: ruff @ruff check src docs tests type: ## Running type checker: pyright @echo "(pyright) Typechecking codebase..." - @pyright src + PYRIGHT_PYTHON_FORCE_VERSION=latest pyright src clean: ## Clean all generated files @echo "Cleaning all generated files..."
@cd $(GIT_ROOT)/docs && make clean diff --git a/docs/howtos/customisations/azure-openai.ipynb b/docs/howtos/customisations/azure-openai.ipynb index 80b9d918a..f336bf34b 100644 --- a/docs/howtos/customisations/azure-openai.ipynb +++ b/docs/howtos/customisations/azure-openai.ipynb @@ -164,7 +164,7 @@ " openai_api_base=\"https://your-endpoint.openai.azure.com/\",\n", " openai_api_type=\"azure\",\n", ")\n", - "# wrapper around azure_model \n", + "# wrapper around azure_model\n", "ragas_azure_model = LangchainLLM(azure_model)\n", "# patch the new RagasLLM instance\n", "answer_relevancy.llm = ragas_azure_model\n", diff --git a/docs/howtos/integrations/langfuse.ipynb b/docs/howtos/integrations/langfuse.ipynb index 746bf6326..2c217a51c 100644 --- a/docs/howtos/integrations/langfuse.ipynb +++ b/docs/howtos/integrations/langfuse.ipynb @@ -25,13 +25,14 @@ "outputs": [], "source": [ "import os\n", + "\n", "# TODO REMOVE ENVIRONMENT VARIABLES!!!\n", "# get keys for your project from https://cloud.langfuse.com\n", "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"\"\n", "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"\"\n", - " \n", + "\n", "# your openai key\n", - "#os.environ[\"OPENAI_API_KEY\"] = \"\"" + "# os.environ[\"OPENAI_API_KEY\"] = \"\"" ] }, { @@ -86,7 +87,7 @@ "source": [ "from datasets import load_dataset\n", "\n", - "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")['baseline']\n", + "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")[\"baseline\"]\n", "fiqa_eval" ] }, @@ -180,7 +181,7 @@ ], "source": [ "row = fiqa_eval[0]\n", - "row['question'], row['answer']" + "row[\"question\"], row[\"answer\"]" ] }, { @@ -199,7 +200,7 @@ "outputs": [], "source": [ "from langfuse import Langfuse\n", - " \n", + "\n", "langfuse = Langfuse()" ] }, @@ -223,7 +224,7 @@ " for m in metrics:\n", " print(f\"calculating {m.name}\")\n", " scores[m.name] = m.score_single(\n", - " {'question': query, 'contexts': chunks, 'answer': answer}\n", + " {\"question\": query, \"contexts\": chunks, \"answer\": answer}\n", " )\n", " return scores" ] @@ -272,26 +273,38 @@ } ], "source": [ - "from langfuse.model import CreateTrace, CreateSpan, CreateGeneration, CreateEvent, CreateScore\n", + "from langfuse.model import (\n", + " CreateTrace,\n", + " CreateSpan,\n", + " CreateGeneration,\n", + " CreateEvent,\n", + " CreateScore,\n", + ")\n", "\n", "# start a new trace when you get a question\n", - "question = row['question']\n", - "trace = langfuse.trace(CreateTrace(name = \"rag\"))\n", + "question = row[\"question\"]\n", + "trace = langfuse.trace(CreateTrace(name=\"rag\"))\n", "\n", "# retrieve the relevant chunks\n", "# chunks = get_similar_chunks(question)\n", - "contexts = row['contexts']\n", + "contexts = row[\"contexts\"]\n", "# pass it as span\n", - "trace.span(CreateSpan(\n", - " name = \"retrieval\", input={'question': question}, output={'contexts': contexts}\n", - "))\n", + "trace.span(\n", + " CreateSpan(\n", + " name=\"retrieval\", input={\"question\": question}, output={\"contexts\": contexts}\n", + " )\n", + ")\n", "\n", "# use llm to generate a answer with the chunks\n", "# answer = get_response_from_llm(question, chunks)\n", - "answer = row['answer']\n", - "trace.span(CreateSpan(\n", - " name = \"generation\", input={'question': question, 'contexts': contexts}, output={'answer': answer}\n", - "))\n", + "answer = row[\"answer\"]\n", + "trace.span(\n", + " CreateSpan(\n", + " name=\"generation\",\n", + " input={\"question\": question, \"contexts\": contexts},\n", + " output={\"answer\": 
answer},\n", + " )\n", + ")\n", "\n", "# compute scores for the question, context, answer tuple\n", "ragas_scores = score_with_ragas(question, contexts, answer)\n", @@ -357,20 +370,31 @@ "metadata": {}, "outputs": [], "source": [ - "from langfuse.model import CreateTrace, CreateSpan, CreateGeneration, CreateEvent, CreateScore\n", + "from langfuse.model import (\n", + " CreateTrace,\n", + " CreateSpan,\n", + " CreateGeneration,\n", + " CreateEvent,\n", + " CreateScore,\n", + ")\n", + "\n", "# fiqa traces\n", "for interaction in fiqa_eval.select(range(10, 20)):\n", - " trace = langfuse.trace(CreateTrace(name = \"rag\"))\n", - " trace.span(CreateSpan(\n", - " name = \"retrieval\", \n", - " input={'question': question}, \n", - " output={'contexts': contexts}\n", - " ))\n", - " trace.span(CreateSpan(\n", - " name = \"generation\", \n", - " input={'question': question, 'contexts': contexts}, \n", - " output={'answer': answer}\n", - " ))\n", + " trace = langfuse.trace(CreateTrace(name=\"rag\"))\n", + " trace.span(\n", + " CreateSpan(\n", + " name=\"retrieval\",\n", + " input={\"question\": question},\n", + " output={\"contexts\": contexts},\n", + " )\n", + " )\n", + " trace.span(\n", + " CreateSpan(\n", + " name=\"generation\",\n", + " input={\"question\": question, \"contexts\": contexts},\n", + " output={\"answer\": answer},\n", + " )\n", + " )\n", "\n", "# await that Langfuse SDK has processed all events before trying to retrieve it in the next step\n", "langfuse.flush()" @@ -393,12 +417,10 @@ "source": [ "def get_traces(name=None, limit=None, user_id=None):\n", " all_data = []\n", - " page = 1 \n", + " page = 1\n", "\n", " while True:\n", - " response = langfuse.client.trace.list(\n", - " name=name, page=page, user_id=user_id\n", - " )\n", + " response = langfuse.client.trace.list(name=name, page=page, user_id=user_id)\n", " if not response.data:\n", " break\n", " page += 1\n", @@ -430,7 +452,7 @@ "from random import sample\n", "\n", "NUM_TRACES_TO_SAMPLE = 3\n", - "traces = get_traces(name='rag', limit=5)\n", + "traces = get_traces(name=\"rag\", limit=5)\n", "traces_sample = sample(traces, NUM_TRACES_TO_SAMPLE)\n", "\n", "len(traces_sample)" @@ -464,15 +486,15 @@ "for t in traces_sample:\n", " observations = [langfuse.client.observations.get(o) for o in t.observations]\n", " for o in observations:\n", - " if o.name == 'retrieval':\n", - " question = o.input['question']\n", - " contexts = o.output['contexts']\n", - " if o.name=='generation':\n", - " answer = o.output['answer']\n", - " evaluation_batch['question'].append(question)\n", - " evaluation_batch['contexts'].append(contexts)\n", - " evaluation_batch['answer'].append(answer)\n", - " evaluation_batch['trace_id'].append(t.id)" + " if o.name == \"retrieval\":\n", + " question = o.input[\"question\"]\n", + " contexts = o.output[\"contexts\"]\n", + " if o.name == \"generation\":\n", + " answer = o.output[\"answer\"]\n", + " evaluation_batch[\"question\"].append(question)\n", + " evaluation_batch[\"contexts\"].append(contexts)\n", + " evaluation_batch[\"answer\"].append(answer)\n", + " evaluation_batch[\"trace_id\"].append(t.id)" ] }, { @@ -671,10 +693,11 @@ "\n", "for _, row in df.iterrows():\n", " for metric_name in [\"faithfulness\", \"answer_relevancy\"]:\n", - " langfuse.score(InitialScore(\n", - " name=metric_name,\n", - " value=row[metric_name],\n", - " trace_id=row[\"trace_id\"]))" + " langfuse.score(\n", + " InitialScore(\n", + " name=metric_name, value=row[metric_name], trace_id=row[\"trace_id\"]\n", + " )\n", + " )" ] }, { diff 
--git a/pyproject.toml b/pyproject.toml index 7de56752a..e0fc5a39c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,9 +6,10 @@ dependencies = [ "sentence-transformers", "datasets", "tiktoken", - "langchain>=0.0.288", + "langchain", "openai", "pysbd>=0.3.4", + "nest-asyncio", ] dynamic = ["version", "readme"] diff --git a/src/ragas/async_utils.py b/src/ragas/async_utils.py index cd39a8661..05c09765b 100644 --- a/src/ragas/async_utils.py +++ b/src/ragas/async_utils.py @@ -1,7 +1,6 @@ """Async utils.""" import asyncio -from itertools import zip_longest -from typing import Any, Coroutine, Iterable, List +from typing import Any, Coroutine, List def run_async_tasks( @@ -10,50 +9,40 @@ def run_async_tasks( progress_bar_desc: str = "Running async tasks", ) -> List[Any]: """Run a list of async tasks.""" - tasks_to_execute: List[Any] = tasks - if show_progress: + + # if running in notebook, use nest_asyncio to hijack the event loop + try: + loop = asyncio.get_running_loop() try: import nest_asyncio - from tqdm.asyncio import tqdm - - # jupyter notebooks already have an event loop running - # we need to reuse it instead of creating a new one + except ImportError: + raise RuntimeError( + "nest_asyncio is required to run async tasks in jupyter. Please install it via `pip install nest_asyncio`." # noqa + ) + else: nest_asyncio.apply() - loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + + # gather tasks to run + if show_progress: + from tqdm.asyncio import tqdm + + async def _gather() -> List[Any]: + "gather tasks and show progress bar" + return await tqdm.gather(*tasks_to_execute, desc=progress_bar_desc) + + else: # don't show_progress - async def _tqdm_gather() -> List[Any]: - return await tqdm.gather(*tasks_to_execute, desc=progress_bar_desc) + async def _gather() -> List[Any]: + return await asyncio.gather(*tasks_to_execute) - tqdm_outputs: List[Any] = loop.run_until_complete(_tqdm_gather()) - return tqdm_outputs + try: + outputs: List[Any] = loop.run_until_complete(_gather()) + except Exception as e: # run the operation w/o tqdm on hitting a fatal # may occur in some environments where tqdm.asyncio # is not supported - except ImportError as e: - print(e) - except Exception: - pass - - async def _gather() -> List[Any]: - return await asyncio.gather(*tasks_to_execute) - - outputs: List[Any] = asyncio.run(_gather()) + raise RuntimeError("Fatal error occurred while running async tasks.", e) return outputs - - -def chunks(iterable: Iterable, size: int) -> Iterable: - args = [iter(iterable)] * size - return zip_longest(*args, fillvalue=None) - - -async def batch_gather( - tasks: List[Coroutine], batch_size: int = 10, verbose: bool = False -) -> List[Any]: - output: List[Any] = [] - for task_chunk in chunks(tasks, batch_size): - output_chunk = await asyncio.gather(*task_chunk) - output.extend(output_chunk) - if verbose: - print(f"Completed {len(output)} out of {len(tasks)} tasks") - return output diff --git a/src/ragas/embeddings/__init__.py b/src/ragas/embeddings/__init__.py index ee5e7caba..1eba70cdc 100644 --- a/src/ragas/embeddings/__init__.py +++ b/src/ragas/embeddings/__init__.py @@ -1,3 +1,13 @@ -from ragas.embeddings.base import HuggingfaceEmbeddings, OpenAIEmbeddings +from ragas.embeddings.base import ( + AzureOpenAIEmbeddings, + HuggingfaceEmbeddings, + OpenAIEmbeddings, + RagasEmbeddings, +) -__all__ = ["HuggingfaceEmbeddings", "OpenAIEmbeddings"] +__all__ = [ + "HuggingfaceEmbeddings", + "OpenAIEmbeddings", + "AzureOpenAIEmbeddings", + 
"RagasEmbeddings", +] diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 084fb8f13..634a7da15 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -2,16 +2,80 @@ import os import typing as t -from dataclasses import dataclass, field +from dataclasses import field from typing import List import numpy as np -from langchain.embeddings import OpenAIEmbeddings -from langchain.schema.embeddings import Embeddings as RagasEmbeddings +from langchain.embeddings import AzureOpenAIEmbeddings as BaseAzureOpenAIEmbeddings +from langchain.embeddings import OpenAIEmbeddings as BaseOpenAIEmbeddings +from langchain.schema.embeddings import Embeddings +from pydantic.dataclasses import dataclass + +from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound +from ragas.utils import NO_KEY DEFAULT_MODEL_NAME = "BAAI/bge-small-en-v1.5" +class RagasEmbeddings(Embeddings): + def validate_api_key(self): + """ + Validates that the api key is set for the Embeddings + """ + pass + + +class OpenAIEmbeddings(BaseOpenAIEmbeddings, RagasEmbeddings): + api_key: str = NO_KEY + + def __init__(self, api_key: str = NO_KEY): + # api key + key_from_env = os.getenv("OPENAI_API_KEY", NO_KEY) + if key_from_env != NO_KEY: + openai_api_key = key_from_env + else: + openai_api_key = api_key + super(BaseOpenAIEmbeddings, self).__init__(openai_api_key=openai_api_key) + self.api_key = openai_api_key + + def validate_api_key(self): + if self.openai_api_key == NO_KEY: + raise OpenAIKeyNotFound + + +class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, RagasEmbeddings): + azure_endpoint: t.Optional[str] = None + deployment: t.Optional[str] = None + api_version: t.Optional[str] = None + api_key: str = NO_KEY + + def __init__( + self, + api_version: t.Optional[str] = None, + azure_endpoint: t.Optional[str] = None, + deployment: t.Optional[str] = None, + api_key: str = NO_KEY, + ): + # api key + key_from_env = os.getenv("AZURE_OPENAI_API_KEY", NO_KEY) + if key_from_env != NO_KEY: + openai_api_key = key_from_env + else: + openai_api_key = api_key + + super(BaseAzureOpenAIEmbeddings, self).__init__( + azure_endpoint=azure_endpoint, # type: ignore (pydantic bug I think) + deployment=deployment, + api_version=api_version, + api_key=openai_api_key, + ) + self.api_key = openai_api_key + + def validate_api_key(self): + if self.openai_api_key == NO_KEY: + raise AzureOpenAIKeyNotFound + + @dataclass class HuggingfaceEmbeddings(RagasEmbeddings): model_name: str = DEFAULT_MODEL_NAME @@ -52,6 +116,10 @@ def __post_init__(self): self.model_name, cache_folder=self.cache_folder, **self.model_kwargs ) + # ensure outputs are tensors + if "convert_to_tensor" not in self.encode_kwargs: + self.encode_kwargs["convert_to_tensor"] = True + def embed_query(self, text: str) -> List[float]: return self.embed_documents([text])[0] @@ -84,6 +152,5 @@ def predict(self, texts: List[List[str]]) -> List[List[float]]: def embedding_factory() -> RagasEmbeddings: - oai_key = os.getenv("OPENAI_API_KEY", "no-key") - openai_embeddings = OpenAIEmbeddings(openai_api_key=oai_key) + openai_embeddings = OpenAIEmbeddings() return openai_embeddings diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 7d8aa45a1..e7e270b03 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -1,5 +1,6 @@ from __future__ import annotations +import typing as t from dataclasses import dataclass, field import numpy as np @@ -133,6 +134,7 @@ def __post_init__(self): value = np.mean(self.scores[cn]) self[cn] = 
value if cn not in self.binary_columns: + value = t.cast(float, value) values.append(value + 1e-10) def to_pandas(self, batch_size: int | None = None, batched: bool = False): diff --git a/src/ragas/exceptions.py b/src/ragas/exceptions.py index d26091393..6459d9e8a 100644 --- a/src/ragas/exceptions.py +++ b/src/ragas/exceptions.py @@ -16,3 +16,10 @@ class OpenAIKeyNotFound(RagasException): def __init__(self): super().__init__(self.message) + + +class AzureOpenAIKeyNotFound(RagasException): + message: str = "AzureOpenAI API key not found! Seems like your trying to use Ragas metrics with AzureOpenAI endpoints. Please set 'AZURE_OPENAI_API_KEY' environment variable" # noqa + + def __init__(self): + super().__init__(self.message) diff --git a/src/ragas/llms/__init__.py b/src/ragas/llms/__init__.py index 43094b7d2..6f48ae530 100644 --- a/src/ragas/llms/__init__.py +++ b/src/ragas/llms/__init__.py @@ -1,4 +1,10 @@ -from ragas.llms.base import BaseRagasLLM, LangchainLLM, llm_factory +from ragas.llms.base import RagasLLM +from ragas.llms.langchain import LangchainLLM from ragas.llms.llamaindex import LlamaIndexLLM +from ragas.llms.openai import OpenAI -__all__ = ["BaseRagasLLM", "LangchainLLM", "LlamaIndexLLM", "llm_factory"] +__all__ = ["RagasLLM", "LangchainLLM", "LlamaIndexLLM", "llm_factory", "OpenAI"] + + +def llm_factory(model="gpt-3.5-turbo-16k") -> RagasLLM: + return OpenAI(model=model) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 9cd094b76..b291bf72b 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -1,45 +1,16 @@ from __future__ import annotations -import os import typing as t from abc import ABC, abstractmethod -from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI -from langchain.chat_models.base import BaseChatModel -from langchain.llms import AzureOpenAI, Bedrock, OpenAI, VertexAI -from langchain.llms.base import BaseLLM from langchain.schema import LLMResult -from ragas.async_utils import run_async_tasks - if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks from langchain.prompts import ChatPromptTemplate -def isOpenAI(llm: BaseLLM | BaseChatModel) -> bool: - return isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI) - - -def isBedrock(llm: BaseLLM | BaseChatModel) -> bool: - return isinstance(llm, Bedrock) or isinstance(llm, BedrockChat) - - -# have to specify it twice for runtime and static checks -MULTIPLE_COMPLETION_SUPPORTED = [ - OpenAI, - ChatOpenAI, - AzureOpenAI, - AzureChatOpenAI, - ChatVertexAI, - VertexAI, -] -MultipleCompletionSupportedLLM = t.Union[ - OpenAI, ChatOpenAI, AzureOpenAI, AzureChatOpenAI, ChatVertexAI, VertexAI -] - - -class BaseRagasLLM(ABC): +class RagasLLM(ABC): """ BaseLLM is the base class for all LLMs. It provides a consistent interface for other classes that interact with LLMs like Langchains, LlamaIndex, LiteLLM etc. Handles @@ -57,118 +28,28 @@ class BaseRagasLLM(ABC): def llm(self): ... + def validate_api_key(self): + """ + Validates that the api key is set for the LLM + """ + pass + @abstractmethod def generate( - self, - prompts: list[str], - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> list[list[str]]: - ... 
- - -class LangchainLLM(BaseRagasLLM): - n_completions_supported: bool = True - - def __init__(self, llm: BaseLLM | BaseChatModel): - self.langchain_llm = llm - - @property - def llm(self): - return self.langchain_llm - - @staticmethod - def llm_supports_completions(llm): - for llm_type in MULTIPLE_COMPLETION_SUPPORTED: - if isinstance(llm, llm_type): - return True - - def generate_multiple_completions( self, prompts: list[ChatPromptTemplate], n: int = 1, + temperature: float = 0, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - self.langchain_llm = t.cast(MultipleCompletionSupportedLLM, self.langchain_llm) - old_n = self.langchain_llm.n - self.langchain_llm.n = n - - if isinstance(self.llm, BaseLLM): - ps = [p.format() for p in prompts] - result = self.llm.generate(ps, callbacks=callbacks) - else: # if BaseChatModel - ps = [p.format_messages() for p in prompts] - result = self.llm.generate(ps, callbacks=callbacks) - self.llm.n = old_n - - return result - - async def generate_completions( - self, - prompts: list[ChatPromptTemplate], - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - if isinstance(self.llm, BaseLLM): - ps = [p.format() for p in prompts] - result = await self.llm.agenerate(ps, callbacks=callbacks) - else: # if BaseChatModel - ps = [p.format_messages() for p in prompts] - result = await self.llm.agenerate(ps, callbacks=callbacks) - - return result + ... - def generate( + @abstractmethod + async def agenerate( self, - prompts: list[ChatPromptTemplate], + prompts: ChatPromptTemplate, n: int = 1, temperature: float = 1e-8, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - # set temperature to 0.2 for multiple completions - temperature = 0.2 if n > 1 else 1e-8 - if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - else: - self.llm.temperature = temperature - - if self.llm_supports_completions(self.llm): - return self.generate_multiple_completions(prompts, n, callbacks) - else: # call generate_completions n times to mimic multiple completions - list_llmresults = run_async_tasks( - [self.generate_completions(prompts, callbacks) for _ in range(n)] - ) - - # fill results as if the LLM supported multiple completions - generations = [] - for i in range(len(prompts)): - completions = [] - for result in list_llmresults: - completions.append(result.generations[i][0]) - generations.append(completions) - - # compute total token usage by adding individual token usage - llm_output = list_llmresults[0].llm_output - if (llm_output is not None) and ("token_usage" in llm_output): - sum_prompt_tokens = 0 - sum_completion_tokens = 0 - sum_total_tokens = 0 - for result in list_llmresults: - token_usage = result.llm_output["token_usage"] - sum_prompt_tokens += token_usage["prompt_tokens"] - sum_completion_tokens += token_usage["completion_tokens"] - sum_total_tokens += token_usage["total_tokens"] - - llm_output["token_usage"] = { - "prompt_tokens": sum_prompt_tokens, - "completion_tokens": sum_completion_tokens, - "sum_total_tokens": sum_total_tokens, - } - - return LLMResult(generations=generations, llm_output=llm_output) - - -def llm_factory() -> LangchainLLM: - oai_key = os.getenv("OPENAI_API_KEY", "no-key") - openai_llm = ChatOpenAI(openai_api_key=oai_key) - return LangchainLLM(llm=openai_llm) + ... 
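To make the reworked interface above concrete, a minimal sketch of a RagasLLM implementation follows. It mirrors the TestLLM fixture added in tests/unit/test_llm.py further down in this patch; the EchoLLM name and its echo-the-prompt behaviour are purely illustrative and not part of the patch.

from __future__ import annotations

from langchain.prompts import ChatPromptTemplate
from langchain.schema import Generation, LLMResult

from ragas.llms.base import RagasLLM


class EchoLLM(RagasLLM):
    """Toy RagasLLM that returns the formatted prompt back as the completion."""

    def llm(self):
        return self

    def generate(
        self,
        prompts: list[ChatPromptTemplate],
        n: int = 1,
        temperature: float = 0,
        callbacks=None,
    ) -> LLMResult:
        # one row of generations per prompt, with n identical completions per row
        texts = [p.format() for p in prompts]
        return LLMResult(generations=[[Generation(text=text)] * n for text in texts])

    async def agenerate(
        self,
        prompt: ChatPromptTemplate,
        n: int = 1,
        temperature: float = 1e-8,
        callbacks=None,
    ) -> LLMResult:
        # the async path simply reuses the sync implementation in this sketch
        return self.generate([prompt], n, temperature, callbacks)

Any object implementing generate/agenerate this way (validate_api_key is optional) can be assigned to MetricWithLLM.llm.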
diff --git a/src/ragas/llms/langchain.py b/src/ragas/llms/langchain.py new file mode 100644 index 000000000..1fe3099c7 --- /dev/null +++ b/src/ragas/llms/langchain.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import typing as t + +from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI +from langchain.chat_models.base import BaseChatModel +from langchain.llms import AzureOpenAI, Bedrock, OpenAI, VertexAI +from langchain.llms.base import BaseLLM +from langchain.schema import LLMResult + +from ragas.async_utils import run_async_tasks +from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound +from ragas.llms.base import RagasLLM +from ragas.utils import NO_KEY + +if t.TYPE_CHECKING: + from langchain.callbacks.base import Callbacks + from langchain.prompts import ChatPromptTemplate + + +def isOpenAI(llm: BaseLLM | BaseChatModel) -> bool: + return isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI) + + +def isBedrock(llm: BaseLLM | BaseChatModel) -> bool: + return isinstance(llm, Bedrock) or isinstance(llm, BedrockChat) + + +# have to specify it twice for runtime and static checks +MULTIPLE_COMPLETION_SUPPORTED = [ + OpenAI, + ChatOpenAI, + AzureOpenAI, + AzureChatOpenAI, + ChatVertexAI, + VertexAI, +] +MultipleCompletionSupportedLLM = t.Union[ + OpenAI, ChatOpenAI, AzureOpenAI, AzureChatOpenAI, ChatVertexAI, VertexAI +] + + +def _compute_token_usage_langchain(list_llmresults: t.List[LLMResult]) -> t.Dict: + # compute total token usage by adding individual token usage + llm_output = list_llmresults[0].llm_output + if llm_output is None: + return {} + if (llm_output is not None) and ("token_usage" in llm_output): + sum_prompt_tokens = 0 + sum_completion_tokens = 0 + sum_total_tokens = 0 + for result in list_llmresults: + if result.llm_output is None: + continue + token_usage = result.llm_output["token_usage"] + sum_prompt_tokens += token_usage["prompt_tokens"] + sum_completion_tokens += token_usage["completion_tokens"] + sum_total_tokens += token_usage["total_tokens"] + + llm_output["token_usage"] = { + "prompt_tokens": sum_prompt_tokens, + "completion_tokens": sum_completion_tokens, + "sum_total_tokens": sum_total_tokens, + } + + return llm_output + + +class LangchainLLM(RagasLLM): + n_completions_supported: bool = True + + def __init__(self, llm: BaseLLM | BaseChatModel): + self.langchain_llm = llm + + @property + def llm(self): + return self.langchain_llm + + def validate_api_key(self): + # if langchain OpenAI or ChatOpenAI + if isinstance(self.llm, ChatOpenAI) or isinstance(self.llm, OpenAI): + # make sure the type is LangchainLLM with ChatOpenAI + self.langchain_llm = t.cast(ChatOpenAI, self.langchain_llm) + # raise error if no api key + if self.langchain_llm.openai_api_key == NO_KEY: + raise OpenAIKeyNotFound + + # if langchain AzureOpenAI or ChatAzurerOpenAI + elif isinstance(self.llm, AzureChatOpenAI) or isinstance(self.llm, AzureOpenAI): + self.langchain_llm = t.cast(AzureChatOpenAI, self.langchain_llm) + # raise error if no api key + if self.langchain_llm.openai_api_key == NO_KEY: + raise AzureOpenAIKeyNotFound + + @staticmethod + def llm_supports_completions(llm): + for llm_type in MULTIPLE_COMPLETION_SUPPORTED: + if isinstance(llm, llm_type): + return True + + def _generate_multiple_completions( + self, + prompts: list[ChatPromptTemplate], + n: int = 1, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + self.langchain_llm = t.cast(MultipleCompletionSupportedLLM, self.langchain_llm) + old_n = 
self.langchain_llm.n + self.langchain_llm.n = n + + if isinstance(self.llm, BaseLLM): + ps = [p.format() for p in prompts] + result = self.llm.generate(ps, callbacks=callbacks) + else: # if BaseChatModel + ps = [p.format_messages() for p in prompts] + result = self.llm.generate(ps, callbacks=callbacks) + self.llm.n = old_n + + return result + + async def generate_completions( + self, + prompts: list[ChatPromptTemplate], + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + if isinstance(self.llm, BaseLLM): + ps = [p.format() for p in prompts] + result = await self.llm.agenerate(ps, callbacks=callbacks) + else: # if BaseChatModel + ps = [p.format_messages() for p in prompts] + result = await self.llm.agenerate(ps, callbacks=callbacks) + + return result + + async def agenerate( + self, + prompt: ChatPromptTemplate, + n: int = 1, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + temperature = 0.2 if n > 1 else 0 + if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): + self.llm.model_kwargs = {"temperature": temperature} + else: + self.llm.temperature = temperature + + if self.llm_supports_completions(self.llm): + self.langchain_llm = t.cast( + MultipleCompletionSupportedLLM, self.langchain_llm + ) + old_n = self.langchain_llm.n + self.langchain_llm.n = n + if isinstance(self.llm, BaseLLM): + result = await self.llm.agenerate( + [prompt.format()], callbacks=callbacks + ) + else: # if BaseChatModel + result = await self.llm.agenerate( + [prompt.format_messages()], callbacks=callbacks + ) + self.langchain_llm.n = old_n + else: + if isinstance(self.llm, BaseLLM): + list_llmresults: list[LLMResult] = run_async_tasks( + [ + self.llm.agenerate([prompt.format()], callbacks=callbacks) + for _ in range(n) + ] + ) + else: + list_llmresults: list[LLMResult] = run_async_tasks( + [ + self.llm.agenerate( + [prompt.format_messages()], callbacks=callbacks + ) + for _ in range(n) + ] + ) + + # fill results as if the LLM supported multiple completions + generations = [r.generations[0][0] for r in list_llmresults] + llm_output = _compute_token_usage_langchain(list_llmresults) + result = LLMResult(generations=[generations], llm_output=llm_output) + + return result + + def generate( + self, + prompts: list[ChatPromptTemplate], + n: int = 1, + temperature: float = 1e-8, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + # set temperature to 0.2 for multiple completions + temperature = 0.2 if n > 1 else 1e-8 + if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): + self.llm.model_kwargs = {"temperature": temperature} + else: + self.llm.temperature = temperature + + if self.llm_supports_completions(self.llm): + return self._generate_multiple_completions(prompts, n, callbacks) + else: # call generate_completions n times to mimic multiple completions + list_llmresults = run_async_tasks( + [self.generate_completions(prompts, callbacks) for _ in range(n)] + ) + + # fill results as if the LLM supported multiple completions + generations = [] + for i in range(len(prompts)): + completions = [] + for result in list_llmresults: + completions.append(result.generations[i][0]) + generations.append(completions) + + llm_output = _compute_token_usage_langchain(list_llmresults) + return LLMResult(generations=generations, llm_output=llm_output) diff --git a/src/ragas/llms/llamaindex.py b/src/ragas/llms/llamaindex.py index 1c36ba583..5554754ca 100644 --- a/src/ragas/llms/llamaindex.py +++ b/src/ragas/llms/llamaindex.py @@ -5,7 +5,7 @@ from langchain.schema.output import 
Generation, LLMResult from ragas.async_utils import run_async_tasks -from ragas.llms.base import BaseRagasLLM +from ragas.llms.base import RagasLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -13,7 +13,7 @@ from llama_index.llms.base import LLM as LiLLM -class LlamaIndexLLM(BaseRagasLLM): +class LlamaIndexLLM(RagasLLM): def __init__(self, llm: LiLLM) -> None: self.llama_index_llm = llm diff --git a/src/ragas/llms/openai.py b/src/ragas/llms/openai.py new file mode 100644 index 000000000..d7d521223 --- /dev/null +++ b/src/ragas/llms/openai.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import os +import typing as t +from abc import abstractmethod +from dataclasses import dataclass, field + +from langchain.adapters.openai import convert_message_to_dict +from langchain.schema import Generation, LLMResult +from openai import AsyncAzureOpenAI, AsyncClient, AsyncOpenAI + +from ragas.async_utils import run_async_tasks +from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound +from ragas.llms.base import RagasLLM +from ragas.llms.langchain import _compute_token_usage_langchain +from ragas.utils import NO_KEY + +if t.TYPE_CHECKING: + from langchain.callbacks.base import Callbacks + from langchain.prompts import ChatPromptTemplate + + +class OpenAIBase(RagasLLM): + def __init__(self, model: str, _api_key_env_var: str) -> None: + self.model = model + self._api_key_env_var = _api_key_env_var + + # api key + key_from_env = os.getenv(self._api_key_env_var, NO_KEY) + if key_from_env != NO_KEY: + self.api_key = key_from_env + else: + self.api_key = self.api_key + self._client: AsyncClient + + @abstractmethod + def _client_init(self) -> AsyncClient: + ... + + @property + def llm(self): + return self + + def create_llm_result(self, response) -> LLMResult: + """Create the LLMResult from the choices and prompts.""" + if not isinstance(response, dict): + response = response.model_dump() + + # token Usage + token_usage = response.get("usage", {}) + llm_output = { + "token_usage": token_usage, + "model_name": None, + "system_fingerprint": response.get("system_fingerprint", ""), + } + + choices = response["choices"] + generations = [ + Generation( + text=choice["message"]["content"], + generation_info=dict( + finish_reason=choice.get("finish_reason"), + logprobs=choice.get("logprobs"), + ), + ) + for choice in choices + ] + llm_output = {"token_usage": token_usage, "model_name": self.model} + return LLMResult(generations=[generations], llm_output=llm_output) + + def generate( + self, + prompts: list[ChatPromptTemplate], + n: int = 1, + temperature: float = 0, + callbacks: t.Optional[Callbacks] = None, + ) -> t.Any: # TODO: LLMResult + llm_results = run_async_tasks( + [self.agenerate(p, n, temperature, callbacks) for p in prompts] + ) + + generations = [r.generations[0] for r in llm_results] + llm_output = _compute_token_usage_langchain(llm_results) + return LLMResult(generations=generations, llm_output=llm_output) + + async def agenerate( + self, + prompt: ChatPromptTemplate, + n: int = 1, + temperature: float = 0, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + # TODO: use callbacks for llm generate + completion = await self._client.chat.completions.create( + model=self.model, + messages=[convert_message_to_dict(m) for m in prompt.format_messages()], # type: ignore + temperature=temperature, + n=n, + ) + + return self.create_llm_result(completion) + + +@dataclass +class OpenAI(OpenAIBase): + model: str = "gpt-3.5-turbo-16k" + api_key: str = 
field(default=NO_KEY, repr=False) + _api_key_env_var: str = "OPENAI_API_KEY" + + def __post_init__(self): + super().__init__(model=self.model, _api_key_env_var=self._api_key_env_var) + self._client_init() + + def _client_init(self): + self._client = AsyncOpenAI(api_key=self.api_key) + + def validate_api_key(self): + if self.llm.api_key == NO_KEY: + raise OpenAIKeyNotFound + + +@dataclass +class AzureOpenAI(OpenAIBase): + azure_endpoint: str + deployment: str + api_version: str + api_key: str = field(default=NO_KEY, repr=False) + _api_key_env_var: str = "AZURE_OPENAI_API_KEY" + + def __post_init__(self): + super().__init__(model=self.deployment, _api_key_env_var=self._api_key_env_var) + self._client_init() + + def _client_init(self): + self._client = AsyncAzureOpenAI( + api_version=self.api_version, + azure_endpoint=self.azure_endpoint, + api_key=self.api_key, + ) + + def validate_api_key(self): + if self.llm.api_key == NO_KEY: + raise AzureOpenAIKeyNotFound diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 0a58357e2..64c6b93c6 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -2,10 +2,10 @@ from ragas.metrics._answer_relevance import AnswerRelevancy, answer_relevancy from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity from ragas.metrics._context_precision import ContextPrecision, context_precision -from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy from ragas.metrics._context_recall import ContextRecall, context_recall -from ragas.metrics.critique import AspectCritique +from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy from ragas.metrics._faithfulness import Faithfulness, faithfulness +from ragas.metrics.critique import AspectCritique DEFAULT_METRICS = [ answer_relevancy, diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 8a78e5f4d..b0757ac86 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -7,8 +7,8 @@ from datasets import Dataset from ragas.metrics._answer_similarity import AnswerSimilarity -from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.metrics._faithfulness import Faithfulness +from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.manager import CallbackManager diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index a1ca7fc06..a91a57f2a 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import typing as t from dataclasses import dataclass, field @@ -8,7 +7,6 @@ from datasets import Dataset from langchain.callbacks.manager import trace_as_chain_group from langchain.embeddings import OpenAIEmbeddings -from langchain.embeddings.base import Embeddings from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.embeddings.base import embedding_factory diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 2273a5db7..1ea27ed3d 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -53,7 +53,9 @@ def __post_init__(self: t.Self): # only for cross encoder if isinstance(self.embeddings, HuggingfaceEmbeddings): self.is_cross_encoder = True if self.embeddings.is_cross_encoder else False - 
self.embeddings.encode_kwargs = {"batch_size": self.batch_size, "convert_to_tensor": True} + self.embeddings.encode_kwargs = { + "batch_size": self.batch_size, + } def init_model(self): super().init_model() diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index c999cdde8..bc3d1fbdf 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -2,10 +2,8 @@ import typing as t from dataclasses import dataclass -from typing import List import numpy as np -import pysbd from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate @@ -21,6 +19,7 @@ """ # noqa: E501 ) + @dataclass class ContextPrecision(MetricWithLLM): """ @@ -89,4 +88,4 @@ def _score_batch( return scores -context_precision = ContextPrecision() \ No newline at end of file +context_precision = ContextPrecision() diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 899b54864..123396610 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -104,4 +104,5 @@ def _score_batch( return scores + context_relevancy = ContextRelevancy() diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index d36e43d90..1bc0b75ae 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -14,16 +14,16 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.chat_models import ChatOpenAI -from langchain.llms import OpenAI from tqdm import tqdm -from ragas.exceptions import OpenAIKeyNotFound -from ragas.llms import LangchainLLM, llm_factory +from ragas.llms import llm_factory if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks + from ragas.embeddings.base import RagasEmbeddings + from ragas.llms import RagasLLM + def make_batches(total_size: int, batch_size: int) -> list[range]: """ @@ -110,10 +110,15 @@ def get_batches(self, dataset_size: int) -> list[range]: @dataclass class MetricWithLLM(Metric): - llm: LangchainLLM = field(default_factory=llm_factory) + llm: RagasLLM = field(default_factory=llm_factory) def init_model(self): - if isinstance(self.llm, ChatOpenAI) or isinstance(self.llm, OpenAI): - self.llm.langchain_llm = t.cast(ChatOpenAI, self.llm) - if self.llm.langchain_llm.openai_api_key == "no-key": - raise OpenAIKeyNotFound + """ + Init any models in the metric, this is invoked before evaluate() + to load all the models + Also check if the api key is valid for OpenAI and AzureOpenAI + """ + self.llm.validate_api_key() + if hasattr(self, "embeddings"): + self.embeddings = t.cast(RagasEmbeddings, self.embeddings) + self.embeddings.validate_api_key() diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 86ad10b5a..f26eee56e 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -8,8 +8,11 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate -from ragas.llms import LangchainLLM -from ragas.metrics.base import EvaluationMode, MetricWithLLM, llm_factory +from ragas.llms import llm_factory +from ragas.metrics.base import EvaluationMode, MetricWithLLM + +if t.TYPE_CHECKING: + from ragas.llms import RagasLLM CRITIQUE_PROMPT = HumanMessagePromptTemplate.from_template( """Given a input and 
submission. Evaluate the submission only using the given criteria. @@ -55,7 +58,7 @@ class AspectCritique(MetricWithLLM): definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False) - llm: LangchainLLM = field( + llm: RagasLLM = field( default_factory=llm_factory, repr=False, ) diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index 80b8dc256..90f74daa8 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -8,7 +8,6 @@ import numpy as np import numpy.testing as npt import pandas as pd -from langchain.chat_models import ChatOpenAI from langchain.embeddings import OpenAIEmbeddings from langchain.embeddings.base import Embeddings from langchain.prompts import ChatPromptTemplate @@ -20,7 +19,7 @@ from numpy.random import default_rng from tqdm import tqdm -from ragas.llms import LangchainLLM +from ragas.llms import llm_factory from ragas.testset.prompts import ( ANSWER_FORMULATE, COMPRESS_QUESTION, @@ -35,6 +34,10 @@ ) from ragas.testset.utils import load_as_json, load_as_score +if t.TYPE_CHECKING: + from ragas.llms.base import RagasLLM + + DEFAULT_TEST_DISTRIBUTION = { "simple": 0.4, "reasoning": 0.2, @@ -105,8 +108,8 @@ class TestsetGenerator: def __init__( self, - generator_llm: LangchainLLM, - critic_llm: LangchainLLM, + generator_llm: RagasLLM, + critic_llm: RagasLLM, embeddings_model: Embeddings, testset_distribution: t.Optional[t.Dict[str, float]] = None, chat_qa: float = 0.0, @@ -141,8 +144,8 @@ def from_default( chunk_size: int = 512, testset_distribution: dict = DEFAULT_TEST_DISTRIBUTION, ): - generator_llm = LangchainLLM(llm=ChatOpenAI(model=openai_generator_llm)) - critic_llm = LangchainLLM(llm=ChatOpenAI(model=openai_filter_llm)) + generator_llm = llm_factory(openai_generator_llm) + critic_llm = llm_factory(openai_filter_llm) embeddings_model = OpenAIEmbeddings() # type: ignore return cls( generator_llm=generator_llm, diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 4fc011089..0801da28f 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -11,6 +11,8 @@ DEVICES = ["cpu", "cuda"] DEBUG_ENV_VAR = "RAGAS_DEBUG" +# constant to tell us that there is no key passed to the llm/embeddings +NO_KEY = "no-key" def device_check(device: t.Literal["cpu", "cuda"] | Device) -> torch.device: diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 3ada9e05b..b45a0773c 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -1,4 +1,6 @@ -from datasets import load_dataset +import time + +from datasets import DatasetDict, load_dataset from torch.cuda import is_available from ragas import evaluate @@ -13,11 +15,14 @@ DEVICE = "cuda" if is_available() else "cpu" # data -ds = load_dataset("explodinggradients/fiqa", "ragas_eval")["baseline"] +ds = load_dataset("explodinggradients/fiqa", "ragas_eval") +assert isinstance(ds, DatasetDict) +fiqa = ds["baseline"] if __name__ == "__main__": - result = evaluate( - ds.select(range(5)), + start = time.time() + _ = evaluate( + fiqa, metrics=[ answer_relevancy, context_precision, @@ -26,4 +31,4 @@ context_recall, ], ) - print(result) + print(f"Time taken: {time.time() - start:.2f}s") diff --git a/tests/unit/test_embeddings.py b/tests/unit/test_embeddings.py new file mode 100644 index 000000000..9d48db4f9 --- /dev/null +++ b/tests/unit/test_embeddings.py @@ -0,0 +1 @@ +from __future__ import annotations diff --git 
a/tests/unit/test_import.py b/tests/unit/test_import.py index e008fffff..0df78a883 100644 --- a/tests/unit/test_import.py +++ b/tests/unit/test_import.py @@ -2,28 +2,29 @@ import ragas.metrics.critique test_metrics = [ - 'answer_correctness', - 'answer_relevancy', - 'answer_similarity', - 'context_recall', - 'context_precision', - 'context_relevancy', - 'faithfulness' + "answer_correctness", + "answer_relevancy", + "answer_similarity", + "context_recall", + "context_precision", + "context_relevancy", + "faithfulness", ] test_critique = [ - 'harmfulness', - 'maliciousness', - 'coherence', - 'correctness', - 'conciseness', + "harmfulness", + "maliciousness", + "coherence", + "correctness", + "conciseness", ] + def test_import_module(): - assert ragas.metrics is not None, "module is not imported" + assert ragas.metrics is not None, "module is not imported" - for metric in test_metrics: - assert hasattr(ragas.metrics, metric) + for metric in test_metrics: + assert hasattr(ragas.metrics, metric) - for metric in test_critique: - assert hasattr(ragas.metrics.critique, metric) \ No newline at end of file + for metric in test_critique: + assert hasattr(ragas.metrics.critique, metric) diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py new file mode 100644 index 000000000..d414e4f21 --- /dev/null +++ b/tests/unit/test_llm.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import os + +import pytest +from langchain.prompts.chat import ChatPromptTemplate +from langchain.schema import Generation, LLMResult + +from ragas.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings +from ragas.llms.base import RagasLLM +from ragas.llms.openai import ( + AzureOpenAI, + AzureOpenAIKeyNotFound, + OpenAI, + OpenAIKeyNotFound, +) +from ragas.utils import NO_KEY + + +class TestLLM(RagasLLM): + def llm(self): + return self + + def generate( + self, prompts: list[ChatPromptTemplate], n=1, temperature=0, callbacks=None + ): + prompt_strs = [p.format() for p in prompts] + generations = [[Generation(text=prompt_str)] * n for prompt_str in prompt_strs] + return LLMResult(generations=generations) + + async def agenerate( + self, prompt: ChatPromptTemplate, n=1, temperature=0, callbacks=None + ): + return self.generate([prompt], n, temperature, callbacks) + + def validate_api_key(self): + if os.getenv("FAKELLM_API_KEY", NO_KEY) == NO_KEY: + raise ValueError("FAKELLM_API_KEY not found in environment variables.") + + +def test_validate_api_key(): + llm = TestLLM() + with pytest.raises(ValueError): + llm.validate_api_key() + os.environ["FAKELLM_API_KEY"] = "random-key-102848595" + # just check if no error is raised + assert llm.validate_api_key() is None + + +def openai_llm_factory(with_api_key): + if with_api_key: + api_key = "random-key-102848595" + return OpenAI(api_key=api_key), api_key + else: + return OpenAI() + + +def openai_embedding_factory(with_api_key): + if with_api_key: + api_key = "random-key-102848595" + return OpenAIEmbeddings(api_key=api_key), api_key + else: + return OpenAIEmbeddings() + + +def azure_llm_factory(with_api_key): + if with_api_key: + api_key = "random-key-102848595" + return ( + AzureOpenAI( + api_version="2020-09-03", + api_key=api_key, + azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", + deployment="en-fr", + ), + api_key, + ) + else: + return AzureOpenAI( + azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", + deployment="en-fr", + api_version="2020-09-03", + ) + + +def azure_embed_factory(with_api_key): + if with_api_key: + api_key = 
"random-key-102848595" + return ( + AzureOpenAIEmbeddings( + api_version="2020-09-03", + api_key=api_key, + azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", + deployment="en-fr", + ), + api_key, + ) + else: + return AzureOpenAIEmbeddings( + azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", + deployment="en-fr", + api_version="2020-09-03", + ) + + +@pytest.mark.parametrize( + "factory, key_not_found_exception, environ_key", + [ + (openai_llm_factory, OpenAIKeyNotFound, "OPENAI_API_KEY"), + (azure_llm_factory, AzureOpenAIKeyNotFound, "AZURE_OPENAI_API_KEY"), + (openai_embedding_factory, OpenAIKeyNotFound, "OPENAI_API_KEY"), + (azure_embed_factory, AzureOpenAIKeyNotFound, "AZURE_OPENAI_API_KEY"), + ], +) +def test_validate_api_key_for_different_llms( + factory, key_not_found_exception, environ_key +): + # load key from environment variables + if environ_key in os.environ: + os.environ.pop(environ_key) + obj = factory(with_api_key=False) + with pytest.raises(key_not_found_exception): + obj.validate_api_key() + os.environ[environ_key] = "random-key-102848595" + obj = factory(with_api_key=False) + assert obj.validate_api_key() is None + + # load key which is passed as argument + if environ_key in os.environ: + os.environ.pop(environ_key) + obj, _ = factory(with_api_key=True) + assert obj.validate_api_key() is None + + # assert order of precedence + os.environ[environ_key] = "random-key-102848595" + obj, api_key = factory(with_api_key=True) + assert obj.validate_api_key + assert obj.api_key == api_key diff --git a/tests/unit/test_simple.py b/tests/unit/test_simple.py index fe73098ad..43b27eef6 100644 --- a/tests/unit/test_simple.py +++ b/tests/unit/test_simple.py @@ -16,13 +16,6 @@ def test_type_casting(): def test_import_metrics(): - from ragas.metrics import ( - answer_relevancy, - context_precision, - context_recall, - context_relevancy, - faithfulness, - ) from ragas.metrics.critique import harmfulness assert harmfulness is not None