From 7eec750d841611827ac1f39dee6fc1a4c1d0f69f Mon Sep 17 00:00:00 2001 From: ikka Date: Fri, 6 Dec 2024 12:20:48 +0530 Subject: [PATCH] feat: Test generation for non-english corpus (#1734) --- docs/howtos/customizations/index.md | 2 +- .../testgenerator/_language_adaptation.md | 155 +++++++++ .../testgenerator/language_adaptation.ipynb | 297 ++++++++++++++++++ mkdocs.yml | 2 +- .../testset/synthesizers/multi_hop/base.py | 4 +- .../testset/synthesizers/multi_hop/prompts.py | 43 ++- .../testset/synthesizers/single_hop/base.py | 13 +- .../synthesizers/single_hop/prompts.py | 23 +- 8 files changed, 515 insertions(+), 24 deletions(-) create mode 100644 docs/howtos/customizations/testgenerator/_language_adaptation.md create mode 100644 docs/howtos/customizations/testgenerator/language_adaptation.ipynb diff --git a/docs/howtos/customizations/index.md b/docs/howtos/customizations/index.md index a95efe75f..b2aabcb87 100644 --- a/docs/howtos/customizations/index.md +++ b/docs/howtos/customizations/index.md @@ -15,7 +15,7 @@ How to customize various aspects of Ragas to suit your needs. ## Testset Generation - +- [Generate test data from non-english corpus](testgenerator/_language_adaptation.md) - [Configure or automatically generate Personas](testgenerator/_persona_generator.md) - [Customize single-hop queries for RAG evaluation](testgenerator/_testgen-custom-single-hop.md) - [Create custom multi-hop queries for RAG evaluation](testgenerator/_testgen-customisation.md) diff --git a/docs/howtos/customizations/testgenerator/_language_adaptation.md b/docs/howtos/customizations/testgenerator/_language_adaptation.md new file mode 100644 index 000000000..510168e00 --- /dev/null +++ b/docs/howtos/customizations/testgenerator/_language_adaptation.md @@ -0,0 +1,155 @@ +## Synthetic test generation from non-english corpus + +In this notebook, you'll learn how to adapt synthetic test data generation to non-english corpus settings. 
For the sake of this tutorial, I am generating queries in Spanish from Spanish wikipedia articles.

### Download and Load corpus


```python
! git clone https://huggingface.co/datasets/explodinggradients/Sample_non_english_corpus
```

    Cloning into 'Sample_non_english_corpus'...
    remote: Enumerating objects: 12, done.
    remote: Counting objects: 100% (8/8), done.
    remote: Compressing objects: 100% (8/8), done.
    remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)
    Unpacking objects: 100% (12/12), 11.43 KiB | 780.00 KiB/s, done.



```python
from langchain_community.document_loaders import DirectoryLoader, TextLoader


path = "Sample_non_english_corpus/"
loader = DirectoryLoader(path, glob="**/*.txt")
docs = loader.load()
```

    /opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.20) or chardet (5.2.0)/charset_normalizer (None) doesn't match a supported version!
      warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "



```python
len(docs)
```




    6



### Initialize required models


```python
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
```

    /opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
      from .autonotebook import tqdm as notebook_tqdm


### Setup Persona and transforms
You may automatically create personas using this [notebook](./_persona_generator.md). 
For the sake of simplicity, I am using a pre-defined persona, two basic transforms and a simple specific query distribution.


```python
from ragas.testset.persona import Persona

personas = [
    Persona(
        name="curious student",
        role_description="A student who is curious about the world and wants to learn more about different cultures and languages",
    ),
]
```


```python
from ragas.testset.transforms.extractors.llm_based import NERExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter

transforms = [HeadlineSplitter(), NERExtractor()]
```

### Initialize test generator


```python
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(
    llm=generator_llm, embedding_model=generator_embeddings, persona_list=personas
)
```

### Load and Adapt Queries

Here we load the required query types and adapt them to the target language.


```python
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)

distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]

for query, _ in distribution:
    prompts = await query.adapt_prompts("spanish", llm=generator_llm)
    query.set_prompts(**prompts)
```

### Generate


```python
dataset = generator.generate_with_langchain_docs(
    docs[:],
    testset_size=5,
    transforms=transforms,
    query_distribution=distribution,
)
```

    Applying HeadlineSplitter:   0%|          | 0/6 [00:00 t.List[str]:
         contexts = []
-        for node in scenario.nodes:
-            context = f"{node.id}" + "\n\n" + node.properties.get("page_content", "")
+        for i, node in enumerate(scenario.nodes):
+            context = f"<{i+1}-hop>" + "\n\n" + node.properties.get("page_content", "")
             contexts.append(context)
         return contexts
diff --git a/src/ragas/testset/synthesizers/multi_hop/prompts.py
index 279347856..6675421dc 100644
--- a/src/ragas/testset/synthesizers/multi_hop/prompts.py
+++ 
b/src/ragas/testset/synthesizers/multi_hop/prompts.py @@ -72,14 +72,43 @@ class QueryAnswerGenerationPrompt( PydanticPrompt[QueryConditions, GeneratedQueryAnswer] ): instruction: str = ( - "Generate a query and answer based on the specified conditions (persona, themes, style, length) " - "and the provided context. Ensure the answer is fully faithful to the context, only using information " - "directly from the nodes provided." + "Generate a multi-hop query and answer based on the specified conditions (persona, themes, style, length) " + "and the provided context. The themes represent a set of phrases either extracted or generated from the " + "context, which highlight the suitability of the selected context for multi-hop query creation. Ensure the query " + "explicitly incorporates these themes." "### Instructions:\n" - "1. **Generate a Query**: Based on the context, persona, themes, style, and length, create a question " - "that aligns with the persona’s perspective and reflects the themes.\n" - "2. **Generate an Answer**: Using only the content from the provided context, create a faithful and detailed answer to " - "the query. Do not include any information that not in or cannot be inferred from the given context.\n" + "1. **Generate a Multi-Hop Query**: Use the provided context segments and themes to form a query that requires combining " + "information from multiple segments (e.g., `<1-hop>` and `<2-hop>`). Ensure the query explicitly incorporates one or more " + "themes and reflects their relevance to the context.\n" + "2. **Generate an Answer**: Use only the content from the provided context to create a detailed and faithful answer to " + "the query. Avoid adding information that is not directly present or inferable from the given context.\n" + "3. **Multi-Hop Context Tags**:\n" + " - Each context segment is tagged as `<1-hop>`, `<2-hop>`, etc.\n" + " - Ensure the query uses information from at least two segments and connects them meaningfully." 
) input_model: t.Type[QueryConditions] = QueryConditions output_model: t.Type[GeneratedQueryAnswer] = GeneratedQueryAnswer + examples: t.List[t.Tuple[QueryConditions, GeneratedQueryAnswer]] = [ + ( + QueryConditions( + persona=Persona( + name="Historian", + role_description="Focuses on major scientific milestones and their global impact.", + ), + themes=["Theory of Relativity", "Experimental Validation"], + query_style="Formal", + query_length="Medium", + context=[ + "<1-hop> Albert Einstein developed the theory of relativity, introducing the concept of spacetime.", + "<2-hop> The bending of light by gravity was confirmed during the 1919 solar eclipse, supporting Einstein’s theory.", + ], + ), + GeneratedQueryAnswer( + query="How was the experimental validation of the theory of relativity achieved during the 1919 solar eclipse?", + answer=( + "The experimental validation of the theory of relativity was achieved during the 1919 solar eclipse by confirming " + "the bending of light by gravity, which supported Einstein’s concept of spacetime as proposed in the theory." 
+ ), + ), + ), + ] diff --git a/src/ragas/testset/synthesizers/single_hop/base.py b/src/ragas/testset/synthesizers/single_hop/base.py index a958117be..967ffaa2f 100644 --- a/src/ragas/testset/synthesizers/single_hop/base.py +++ b/src/ragas/testset/synthesizers/single_hop/base.py @@ -122,7 +122,7 @@ async def _generate_sample( self, scenario: SingleHopScenario, callbacks: Callbacks ) -> SingleTurnSample: - reference_context = self.make_contexts(scenario) + reference_context = scenario.nodes[0].properties.get("page_content", "") prompt_input = QueryCondition( persona=scenario.persona, term=scenario.term, @@ -136,14 +136,5 @@ async def _generate_sample( return SingleTurnSample( user_input=response.query, reference=response.answer, - reference_contexts=reference_context, + reference_contexts=[reference_context], ) - - def make_contexts(self, scenario: SingleHopScenario) -> t.List[str]: - - contexts = [] - for node in scenario.nodes: - context = f"{node.id}" + "\n\n" + node.properties.get("page_content", "") - contexts.append(context) - - return contexts diff --git a/src/ragas/testset/synthesizers/single_hop/prompts.py b/src/ragas/testset/synthesizers/single_hop/prompts.py index 281a86c51..9111f709c 100644 --- a/src/ragas/testset/synthesizers/single_hop/prompts.py +++ b/src/ragas/testset/synthesizers/single_hop/prompts.py @@ -11,7 +11,7 @@ class QueryCondition(BaseModel): term: str query_style: str query_length: str - context: t.List[str] + context: str class GeneratedQueryAnswer(BaseModel): @@ -21,7 +21,7 @@ class GeneratedQueryAnswer(BaseModel): class QueryAnswerGenerationPrompt(PydanticPrompt[QueryCondition, GeneratedQueryAnswer]): instruction: str = ( - "Generate a query and answer based on the specified conditions (persona, term, style, length) " + "Generate a single-hop query and answer based on the specified conditions (persona, term, style, length) " "and the provided context. 
Ensure the answer is entirely faithful to the context, using only the information " "directly from the provided context." "### Instructions:\n" @@ -32,3 +32,22 @@ class QueryAnswerGenerationPrompt(PydanticPrompt[QueryCondition, GeneratedQueryA ) input_model: t.Type[QueryCondition] = QueryCondition output_model: t.Type[GeneratedQueryAnswer] = GeneratedQueryAnswer + examples: t.List[t.Tuple[QueryCondition, GeneratedQueryAnswer]] = [ + ( + QueryCondition( + persona=Persona( + name="Software Engineer", + role_description="Focuses on coding best practices and system design.", + ), + term="microservices", + query_style="Formal", + query_length="Medium", + context="Microservices are an architectural style where applications are structured as a collection of loosely coupled services. " + "Each service is fine-grained and focuses on a single functionality.", + ), + GeneratedQueryAnswer( + query="What is the purpose of microservices in software architecture?", + answer="Microservices are designed to structure applications as a collection of loosely coupled services, each focusing on a single functionality.", + ), + ), + ]