From 7eec750d841611827ac1f39dee6fc1a4c1d0f69f Mon Sep 17 00:00:00 2001 From: ikka Date: Fri, 6 Dec 2024 12:20:48 +0530 Subject: [PATCH] feat: Test generation for non-english corpus (#1734) --- docs/howtos/customizations/index.md | 2 +- .../testgenerator/_language_adaptation.md | 155 +++++++++ .../testgenerator/language_adaptation.ipynb | 297 ++++++++++++++++++ mkdocs.yml | 2 +- .../testset/synthesizers/multi_hop/base.py | 4 +- .../testset/synthesizers/multi_hop/prompts.py | 43 ++- .../testset/synthesizers/single_hop/base.py | 13 +- .../synthesizers/single_hop/prompts.py | 23 +- 8 files changed, 515 insertions(+), 24 deletions(-) create mode 100644 docs/howtos/customizations/testgenerator/_language_adaptation.md create mode 100644 docs/howtos/customizations/testgenerator/language_adaptation.ipynb diff --git a/docs/howtos/customizations/index.md b/docs/howtos/customizations/index.md index a95efe75f..b2aabcb87 100644 --- a/docs/howtos/customizations/index.md +++ b/docs/howtos/customizations/index.md @@ -15,7 +15,7 @@ How to customize various aspects of Ragas to suit your needs. ## Testset Generation - +- [Generate test data from non-english corpus](testgenerator/_language_adaptation.md) - [Configure or automatically generate Personas](testgenerator/_persona_generator.md) - [Customize single-hop queries for RAG evaluation](testgenerator/_testgen-custom-single-hop.md) - [Create custom multi-hop queries for RAG evaluation](testgenerator/_testgen-customisation.md) diff --git a/docs/howtos/customizations/testgenerator/_language_adaptation.md b/docs/howtos/customizations/testgenerator/_language_adaptation.md new file mode 100644 index 000000000..510168e00 --- /dev/null +++ b/docs/howtos/customizations/testgenerator/_language_adaptation.md @@ -0,0 +1,155 @@ +## Synthetic test generation from non-english corpus + +In this notebook, you'll learn how to adapt synthetic test data generation to non-english corpus settings. 
For the sake of this tutorial, I am generating queries in Spanish from Spanish wikipedia articles.

### Download and Load corpus


```python
! git clone https://huggingface.co/datasets/explodinggradients/Sample_non_english_corpus
```

    Cloning into 'Sample_non_english_corpus'...
    remote: Enumerating objects: 12, done.
    remote: Counting objects: 100% (8/8), done.
    remote: Compressing objects: 100% (8/8), done.
    remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)
    Unpacking objects: 100% (12/12), 11.43 KiB | 780.00 KiB/s, done.



```python
from langchain_community.document_loaders import DirectoryLoader, TextLoader


path = "Sample_non_english_corpus/"
loader = DirectoryLoader(path, glob="**/*.txt")
docs = loader.load()
```

    /opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.20) or chardet (5.2.0)/charset_normalizer (None) doesn't match a supported version!
      warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "



```python
len(docs)
```




    6



### Initialize required models


```python
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
```

    /opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
      from .autonotebook import tqdm as notebook_tqdm


### Setup Persona and transforms
You may automatically create personas using this [notebook](./_persona_generator.md). 
For the sake of simplicity, I am using a pre-defined persona, two basic transforms and a simple specific query distribution.


```python
from ragas.testset.persona import Persona

personas = [
    Persona(
        name="curious student",
        role_description="A student who is curious about the world and wants to learn more about different cultures and languages",
    ),
]
```


```python
from ragas.testset.transforms.extractors.llm_based import NERExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter

transforms = [HeadlineSplitter(), NERExtractor()]
```

### Initialize test generator


```python
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(
    llm=generator_llm, embedding_model=generator_embeddings, persona_list=personas
)
```

### Load and Adapt Queries

Here we load the required query types and adapt them to the target language.


```python
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)

distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]

for query, _ in distribution:
    prompts = await query.adapt_prompts("spanish", llm=generator_llm)
    query.set_prompts(**prompts)
```

### Generate


```python
dataset = generator.generate_with_langchain_docs(
    docs[:],
    testset_size=5,
    transforms=transforms,
    query_distribution=distribution,
)
```

    Applying HeadlineSplitter:   0%|          | 0/6 [00:00 t.List[str]:
         contexts = []
-        for node in scenario.nodes:
-            context = f"{node.id}" + "\n\n" + node.properties.get("page_content", "")
+        for i, node in enumerate(scenario.nodes):
+            context = f"<{i+1}-hop>" + "\n\n" + node.properties.get("page_content", "")
             contexts.append(context)
         return contexts
diff --git a/src/ragas/testset/synthesizers/multi_hop/prompts.py
index 279347856..6675421dc 100644
--- a/src/ragas/testset/synthesizers/multi_hop/prompts.py
+++ 
b/src/ragas/testset/synthesizers/multi_hop/prompts.py @@ -72,14 +72,43 @@ class QueryAnswerGenerationPrompt( PydanticPrompt[QueryConditions, GeneratedQueryAnswer] ): instruction: str = ( - "Generate a query and answer based on the specified conditions (persona, themes, style, length) " - "and the provided context. Ensure the answer is fully faithful to the context, only using information " - "directly from the nodes provided." + "Generate a multi-hop query and answer based on the specified conditions (persona, themes, style, length) " + "and the provided context. The themes represent a set of phrases either extracted or generated from the " + "context, which highlight the suitability of the selected context for multi-hop query creation. Ensure the query " + "explicitly incorporates these themes." "### Instructions:\n" - "1. **Generate a Query**: Based on the context, persona, themes, style, and length, create a question " - "that aligns with the persona’s perspective and reflects the themes.\n" - "2. **Generate an Answer**: Using only the content from the provided context, create a faithful and detailed answer to " - "the query. Do not include any information that not in or cannot be inferred from the given context.\n" + "1. **Generate a Multi-Hop Query**: Use the provided context segments and themes to form a query that requires combining " + "information from multiple segments (e.g., `<1-hop>` and `<2-hop>`). Ensure the query explicitly incorporates one or more " + "themes and reflects their relevance to the context.\n" + "2. **Generate an Answer**: Use only the content from the provided context to create a detailed and faithful answer to " + "the query. Avoid adding information that is not directly present or inferable from the given context.\n" + "3. **Multi-Hop Context Tags**:\n" + " - Each context segment is tagged as `<1-hop>`, `<2-hop>`, etc.\n" + " - Ensure the query uses information from at least two segments and connects them meaningfully." 
) input_model: t.Type[QueryConditions] = QueryConditions output_model: t.Type[GeneratedQueryAnswer] = GeneratedQueryAnswer + examples: t.List[t.Tuple[QueryConditions, GeneratedQueryAnswer]] = [ + ( + QueryConditions( + persona=Persona( + name="Historian", + role_description="Focuses on major scientific milestones and their global impact.", + ), + themes=["Theory of Relativity", "Experimental Validation"], + query_style="Formal", + query_length="Medium", + context=[ + "<1-hop> Albert Einstein developed the theory of relativity, introducing the concept of spacetime.", + "<2-hop> The bending of light by gravity was confirmed during the 1919 solar eclipse, supporting Einstein’s theory.", + ], + ), + GeneratedQueryAnswer( + query="How was the experimental validation of the theory of relativity achieved during the 1919 solar eclipse?", + answer=( + "The experimental validation of the theory of relativity was achieved during the 1919 solar eclipse by confirming " + "the bending of light by gravity, which supported Einstein’s concept of spacetime as proposed in the theory." 
+ ), + ), + ), + ] diff --git a/src/ragas/testset/synthesizers/single_hop/base.py b/src/ragas/testset/synthesizers/single_hop/base.py index a958117be..967ffaa2f 100644 --- a/src/ragas/testset/synthesizers/single_hop/base.py +++ b/src/ragas/testset/synthesizers/single_hop/base.py @@ -122,7 +122,7 @@ async def _generate_sample( self, scenario: SingleHopScenario, callbacks: Callbacks ) -> SingleTurnSample: - reference_context = self.make_contexts(scenario) + reference_context = scenario.nodes[0].properties.get("page_content", "") prompt_input = QueryCondition( persona=scenario.persona, term=scenario.term, @@ -136,14 +136,5 @@ async def _generate_sample( return SingleTurnSample( user_input=response.query, reference=response.answer, - reference_contexts=reference_context, + reference_contexts=[reference_context], ) - - def make_contexts(self, scenario: SingleHopScenario) -> t.List[str]: - - contexts = [] - for node in scenario.nodes: - context = f"{node.id}" + "\n\n" + node.properties.get("page_content", "") - contexts.append(context) - - return contexts diff --git a/src/ragas/testset/synthesizers/single_hop/prompts.py b/src/ragas/testset/synthesizers/single_hop/prompts.py index 281a86c51..9111f709c 100644 --- a/src/ragas/testset/synthesizers/single_hop/prompts.py +++ b/src/ragas/testset/synthesizers/single_hop/prompts.py @@ -11,7 +11,7 @@ class QueryCondition(BaseModel): term: str query_style: str query_length: str - context: t.List[str] + context: str class GeneratedQueryAnswer(BaseModel): @@ -21,7 +21,7 @@ class GeneratedQueryAnswer(BaseModel): class QueryAnswerGenerationPrompt(PydanticPrompt[QueryCondition, GeneratedQueryAnswer]): instruction: str = ( - "Generate a query and answer based on the specified conditions (persona, term, style, length) " + "Generate a single-hop query and answer based on the specified conditions (persona, term, style, length) " "and the provided context. 
Ensure the answer is entirely faithful to the context, using only the information " "directly from the provided context." "### Instructions:\n" @@ -32,3 +32,22 @@ class QueryAnswerGenerationPrompt(PydanticPrompt[QueryCondition, GeneratedQueryA ) input_model: t.Type[QueryCondition] = QueryCondition output_model: t.Type[GeneratedQueryAnswer] = GeneratedQueryAnswer + examples: t.List[t.Tuple[QueryCondition, GeneratedQueryAnswer]] = [ + ( + QueryCondition( + persona=Persona( + name="Software Engineer", + role_description="Focuses on coding best practices and system design.", + ), + term="microservices", + query_style="Formal", + query_length="Medium", + context="Microservices are an architectural style where applications are structured as a collection of loosely coupled services. " + "Each service is fine-grained and focuses on a single functionality.", + ), + GeneratedQueryAnswer( + query="What is the purpose of microservices in software architecture?", + answer="Microservices are designed to structure applications as a collection of loosely coupled services, each focusing on a single functionality.", + ), + ), + ]