diff --git a/docs/howtos/customisations/aws-bedrock.ipynb b/docs/howtos/customisations/aws-bedrock.ipynb new file mode 100644 index 000000000..9aef4035b --- /dev/null +++ b/docs/howtos/customisations/aws-bedrock.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7c249b40", + "metadata": {}, + "source": [ + "# Using Amazon Bedrock\n", + "\n", + "Amazon Bedrock is a fully managed service that makes FMs from leading AI startups and Amazon available via an API, so you can choose from a wide range of FMs to find the model that is best suited for your use case.\n", + "\n", + "This tutorial will show you how to use Amazon Bedrock endpoints and LangChain." + ] + }, + { + "cell_type": "markdown", + "id": "2e63f667", + "metadata": {}, + "source": [ + ":::{Note}\n", + "this guide is for folks who are using the Amazon Bedrock endpoints. Check the [evaluation guide](../../getstarted/evaluation.md) if your using OpenAI endpoints.\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "e54b5e01", + "metadata": {}, + "source": [ + "### Load sample dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b658e02f", + "metadata": {}, + "outputs": [], + "source": [ + "# data\n", + "from datasets import load_dataset\n", + "\n", + "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n", + "fiqa_eval" + ] + }, + { + "cell_type": "markdown", + "id": "d4b8a69c", + "metadata": {}, + "source": [ + "Lets import metrics that we are going to use" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f17bcf9d", + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.metrics import (\n", + " context_precision,\n", + " answer_relevancy, # AnswerRelevancy\n", + " faithfulness,\n", + " context_recall,\n", + ")\n", + "from ragas.metrics.critique import harmfulness\n", + "\n", + "# list of metrics we're going to use\n", + "metrics = [\n", + " faithfulness,\n", + " answer_relevancy,\n", + " context_recall,\n", + " context_precision,\n", + " harmfulness,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "f1201199", + "metadata": {}, + "source": [ + "Now lets swap out the default `ChatOpenAI` with `BedrockChat`. Init a new instance of `BedrockChat` with the `model_id` of the model you want to use. You will also have to change the `BedrockEmbeddings` in the metrics that use them, which in our case is `answer_relevance`.\n", + "\n", + "Now in order to use the new `BedrockChat` llm instance with Ragas metrics, you have to create a new instance of `RagasLLM` using the `ragas.llms.LangchainLLM` wrapper. Its a simple wrapper around langchain that make Langchain LLM/Chat instances compatible with how Ragas metrics will use them." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "40406a26", + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.llms import LangchainLLM\n", + "from langchain.chat_models import BedrockChat\n", + "from langchain.embeddings import BedrockEmbeddings\n", + "\n", + "config = {\n", + " \"credentials_profile_name\": \"your-profile-name\", # E.g \"default\"\n", + " \"region_name\": \"your-region-name\", # E.g. \"us-east-1\"\n", + " \"model_id\": \"your-model-id\", # E.g \"anthropic.claude-v2\"\n", + " \"model_kwargs\": {\"temperature\": 0.4},\n", + "}\n", + "\n", + "bedrock_model = BedrockChat(\n", + " credentials_profile_name=config[\"credentials_profile_name\"],\n", + " region_name=config[\"region_name\"],\n", + " endpoint_url=f\"https://bedrock-runtime.{config['region_name']}.amazonaws.com\",\n", + " model_id=config[\"model_id\"],\n", + " model_kwargs=config[\"model_kwargs\"],\n", + ")\n", + "# wrapper around bedrock_model\n", + "ragas_bedrock_model = LangchainLLM(bedrock_model)\n", + "# patch the new RagasLLM instance\n", + "answer_relevancy.llm = ragas_bedrock_model\n", + "\n", + "# init and change the embeddings\n", + "# only for answer_relevancy\n", + "bedrock_embeddings = BedrockEmbeddings(\n", + " credentials_profile_name=config[\"credentials_profile_name\"],\n", + " region_name=config[\"region_name\"],\n", + ")\n", + "# embeddings can be used as it is\n", + "answer_relevancy.embeddings = bedrock_embeddings" + ] + }, + { + "cell_type": "markdown", + "id": "44641e41", + "metadata": {}, + "source": [ + "This replaces the default llm of `answer_relevancy` with the Amazon Bedrock endpoint. Now with some `__setattr__` magic lets change it for all other metrics." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "52d9f5f3", + "metadata": {}, + "outputs": [], + "source": [ + "for m in metrics:\n", + " m.__setattr__(\"llm\", ragas_bedrock_model)" + ] + }, + { + "cell_type": "markdown", + "id": "8d6ecd5a", + "metadata": {}, + "source": [ + "### Evaluation\n", + "\n", + "Running the evalutation is as simple as calling evaluate on the `Dataset` with the metrics of your choice." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "22eb6f97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [faithfulness]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 2/2 [01:22<00:00, 41.24s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [answer_relevancy]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 2/2 [01:21<00:00, 40.59s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [context_recall]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 2/2 [00:46<00:00, 23.22s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [context_precision]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 2/2 [00:59<00:00, 29.85s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [harmfulness]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 2/2 [00:33<00:00, 16.96s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'faithfulness': 0.9428, 'answer_relevancy': 0.7860, 'context_recall': 0.2296, 'context_precision': 0.0000, 'harmfulness': 0.0000}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ragas import evaluate\n", + "import nest_asyncio # CHECK NOTES\n", + "\n", + "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.\n", + "nest_asyncio.apply()\n", + "\n", + "result = evaluate(\n", + " fiqa_eval[\"baseline\"],\n", + " metrics=metrics,\n", + ")\n", + "\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "a2dc0ec2", + "metadata": {}, + "source": [ + "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n", + "\n", + "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8686bf53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questioncontextsanswerground_truthsfaithfulnessanswer_relevancycontext_recallcontext_precisionharmfulness
0How to deposit a cheque issued to an associate...[Just have the associate sign the back and the...\\nThe best way to deposit a cheque issued to a...[Have the check reissued to the proper payee.J...1.00.9303110.2631580.00
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...1.00.9841220.3636360.00
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...1.00.8838720.3636360.00
3Applying for and receiving business credit[Set up a meeting with the bank that handles y...\\nApplying for and receiving business credit c...[\"I'm afraid the great myth of limited liabili...1.00.5182870.3636360.00
4401k Transfer After Business Closure[The time horizon for your 401K/IRA is essenti...\\nIf your employer has closed and you need to ...[You should probably consult an attorney. Howe...1.00.7794710.0000000.00
\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 How to deposit a cheque issued to an associate... \n", + "1 Can I send a money order from USPS as a business? \n", + "2 1 EIN doing business under multiple business n... \n", + "3 Applying for and receiving business credit \n", + "4 401k Transfer After Business Closure \n", + "\n", + " contexts \\\n", + "0 [Just have the associate sign the back and the... \n", + "1 [Sure you can. You can fill in whatever you w... \n", + "2 [You're confusing a lot of things here. Compan... \n", + "3 [Set up a meeting with the bank that handles y... \n", + "4 [The time horizon for your 401K/IRA is essenti... \n", + "\n", + " answer \\\n", + "0 \\nThe best way to deposit a cheque issued to a... \n", + "1 \\nYes, you can send a money order from USPS as... \n", + "2 \\nYes, it is possible to have one EIN doing bu... \n", + "3 \\nApplying for and receiving business credit c... \n", + "4 \\nIf your employer has closed and you need to ... \n", + "\n", + " ground_truths faithfulness \\\n", + "0 [Have the check reissued to the proper payee.J... 1.0 \n", + "1 [Sure you can. You can fill in whatever you w... 1.0 \n", + "2 [You're confusing a lot of things here. Compan... 1.0 \n", + "3 [\"I'm afraid the great myth of limited liabili... 1.0 \n", + "4 [You should probably consult an attorney. Howe... 1.0 \n", + "\n", + " answer_relevancy context_recall context_precision harmfulness \n", + "0 0.930311 0.263158 0.0 0 \n", + "1 0.984122 0.363636 0.0 0 \n", + "2 0.883872 0.363636 0.0 0 \n", + "3 0.518287 0.363636 0.0 0 \n", + "4 0.779471 0.000000 0.0 0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = result.to_pandas()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f668fce1", + "metadata": {}, + "source": [ + "And thats it!\n", + "\n", + "if you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/howtos/customisations/index.md b/docs/howtos/customisations/index.md index 53e81118a..f56c4ee1d 100644 --- a/docs/howtos/customisations/index.md +++ b/docs/howtos/customisations/index.md @@ -5,4 +5,5 @@ How to customize Ragas for your needs :::{toctree} llms.ipynb azure-openai.ipynb +aws-bedrock.ipynb ::: diff --git a/requirements/dev.txt b/requirements/dev.txt index be811d44f..04e22e4d5 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -6,3 +6,4 @@ pyright llama_index notebook sphinx-autobuild +sentence-transformers diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index ccddf6c86..94c1612ee 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -120,5 +120,7 @@ def init_model(self): """ self.llm.validate_api_key() if hasattr(self, "embeddings"): - self.embeddings = t.cast(RagasEmbeddings, self.embeddings) - self.embeddings.validate_api_key() + # since we are using Langchain Embeddings directly, we need to check this + if hasattr(self.embeddings, "validate_api_key"): + self.embeddings = t.cast(RagasEmbeddings, self.embeddings) + self.embeddings.validate_api_key() diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index f26eee56e..6636fc2b6 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -131,7 +131,8 @@ def _score_batch( else: score = answer_dict.get(response[0][-1]) - scores.append(score) + # patch for critique: force score to 0 if the answer is not Yes or No + scores.append(score if score is not None else 0) return scores