Convert composite evaluators to async based implementation (#3601)
# Description

Please add an informative description that covers the changes made by
the pull request and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Aug 1, 2024
1 parent 401ef1c commit 59d5b9f
Showing 38 changed files with 1,582 additions and 974 deletions.
4 changes: 2 additions & 2 deletions src/promptflow-evals/CHANGELOG.md
@@ -4,13 +4,13 @@

### Features Added
- Introduced `JailbreakAdversarialSimulator` for customers who need to run jailbreak and non-jailbreak adversarial simulations at the same time. More info in the README.md in `/promptflow/evals/synthetic/README.md#jailbreak-simulator`
- Exposed batch evaluation run timeout via "PF_BATCH_TIMEOUT_SEC" environment variable. This variable can be used to set the timeout for the batch evaluation for each evaluator and target separately only, not the entire API call.
- Exposed batch evaluation run timeout via `PF_BATCH_TIMEOUT_SEC` environment variable. This variable can be used to set the timeout for the batch evaluation for each evaluator and target separately only, not the entire API call.

### Bugs Fixed
- Large simulation was causing a jinja exception, this has been fixed.

### Improvements
- Converted built-in evaluators to async-based implementation, leveraging async batch run for performance improvement.
- Converted built-in evaluators to async-based implementation, leveraging async batch run for performance improvement. Introduced `PF_EVALS_BATCH_USE_ASYNC` environment variable to enable/disable async batch run, with the default set to False.
- Parity between evals and Simulator on signature, passing credentials.
- The `AdversarialSimulator` responds with `category` of harm in the response.
- Reduced chances of NaNs in GPT based evaluators.
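An illustrative aside, not part of the diff: both environment variables noted in the CHANGELOG entries above are plain process-level settings, so enabling them might look roughly like the sketch below. The specific values are made-up examples.

```python
import os

# Cap each evaluator/target batch run at 10 minutes; this applies per run,
# not to the entire evaluate call (the value here is an arbitrary example).
os.environ["PF_BATCH_TIMEOUT_SEC"] = "600"

# Opt in to the async batch run introduced by this change; per the CHANGELOG
# it currently defaults to "false".
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "true"
```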
Changes to a second file (file path not shown in this view)
@@ -3,6 +3,7 @@
# ---------------------------------------------------------
import inspect
import logging
import os

import numpy as np

@@ -50,9 +51,13 @@ def get_metrics(self, proxy_run):

@staticmethod
def _should_batch_use_async(flow):
if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
return True
elif inspect.iscoroutinefunction(flow):
return True
else:
return False
# TODO: Change default to true after promptflow-core releases fix for error handler for async prompty
# https://github.com/microsoft/promptflow/pull/3598
if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "false").lower() == "true":
if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
return True
elif inspect.iscoroutinefunction(flow):
return True
else:
return False
return False
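A self-contained sketch (not from this PR) of how the gate above is expected to behave: it reports True only when `PF_EVALS_BATCH_USE_ASYNC` is set and the flow is awaitable, either through an async `__call__` on an instance or as a bare coroutine function. The replica and evaluator stubs below are made up for illustration.

```python
import asyncio
import inspect
import os

# Illustrative stand-alone replica of the gate shown above, for experimentation only.
def should_batch_use_async(flow):
    if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "false").lower() == "true":
        if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
            return True
        if inspect.iscoroutinefunction(flow):
            return True
    return False

class AsyncEvaluator:
    async def __call__(self, *, question: str) -> dict:
        await asyncio.sleep(0)  # stand-in for an async model call
        return {"score": 1.0}

def sync_evaluator(*, question: str) -> dict:
    return {"score": 1.0}

os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "true"
print(should_batch_use_async(AsyncEvaluator()))  # True: instance with async __call__
print(should_batch_use_async(sync_evaluator))    # False: plain synchronous function
```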
168 changes: 94 additions & 74 deletions src/promptflow-evals/promptflow/evals/evaluators/_chat/_chat.py
@@ -1,16 +1,15 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import asyncio
import json
import logging
from concurrent.futures import as_completed
from typing import Dict, List

import numpy as np

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

from .._coherence import CoherenceEvaluator
from .._fluency import FluencyEvaluator
@@ -21,57 +20,7 @@
logger = logging.getLogger(__name__)


class ChatEvaluator:
"""
Initialize a chat evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
**Usage**
.. code-block:: python
chat_eval = ChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
}
]
result = chat_eval(conversation=conversation)
**Output format**
.. code-block:: python
{
"evaluation_per_turn": {
"gpt_retrieval": [1.0, 2.0],
"gpt_groundedness": [5.0, 2.0],
"gpt_relevance": [3.0, 5.0],
"gpt_coherence": [1.0, 2.0],
"gpt_fluency": [3.0, 5.0]
}
"gpt_retrieval": 1.5,
"gpt_groundedness": 3.5,
"gpt_relevance": 4.0,
"gpt_coherence": 1.5,
"gpt_fluency": 4.0
}
"""

class _AsyncChatEvaluator:
def __init__(
self, model_config: AzureOpenAIModelConfiguration, eval_last_turn: bool = False, parallel: bool = True
):
Expand All @@ -80,19 +29,19 @@ def __init__(

# TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
self._rag_evaluators = [
GroundednessEvaluator(model_config),
RelevanceEvaluator(model_config),
GroundednessEvaluator(model_config)._to_async(),
RelevanceEvaluator(model_config)._to_async(),
]
self._non_rag_evaluators = [
CoherenceEvaluator(model_config),
FluencyEvaluator(model_config),
CoherenceEvaluator(model_config)._to_async(),
FluencyEvaluator(model_config)._to_async(),
]
# TODO: Temporary workaround to close the gap of missing retrieval score
# https://msdata.visualstudio.com/Vienna/_workitems/edit/3186644
# For long term, we need to add a built-in evaluator for retrieval after prompt is generalized for QA and Chat
self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)
self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)._to_async()

def __call__(self, *, conversation, **kwargs):
async def __call__(self, *, conversation, **kwargs):
"""
Evaluates chat scenario.
@@ -148,21 +97,20 @@ def __call__(self, *, conversation, **kwargs):

if self._parallel:
# Parallel execution
with ThreadPoolExecutor() as executor:
future_to_evaluator = {
executor.submit(
self._evaluate_turn, turn_num, questions, answers, contexts, evaluator
): evaluator
for evaluator in selected_evaluators
}

for future in as_completed(future_to_evaluator):
result = future.result()
tasks = [
self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
for evaluator in selected_evaluators
]
results = await asyncio.gather(*tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
logger.warning(f"Exception occurred during evaluation: {result}")
else:
current_turn_result.update(result)
else:
# Sequential execution
for evaluator in selected_evaluators:
result = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
result = await self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(result)

per_turn_results.append(current_turn_result)
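For readers less familiar with the pattern used in the new parallel branch above: `asyncio.gather(..., return_exceptions=True)` runs the per-evaluator coroutines concurrently and returns exceptions as values instead of raising, which is what lets the surrounding loop log a warning and keep the remaining results. A generic, self-contained sketch (names are illustrative, not from this PR):

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

async def run_evaluators_concurrently(evaluators, **inputs):
    # Each evaluator is assumed to be an async callable returning a dict of scores.
    tasks = [evaluator(**inputs) for evaluator in evaluators]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    merged = {}
    for result in results:
        if isinstance(result, Exception):
            # A failed evaluator does not abort the whole turn; it is logged and skipped.
            logger.warning("Exception occurred during evaluation: %s", result)
        else:
            merged.update(result)
    return merged

async def _demo():
    async def coherence(**kwargs):
        return {"gpt_coherence": 4.0}

    async def fluency(**kwargs):
        return {"gpt_fluency": 5.0}

    print(await run_evaluators_concurrently([coherence, fluency], question="q", answer="a"))

if __name__ == "__main__":
    asyncio.run(_demo())
```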
@@ -181,20 +129,20 @@ def __call__(self, *, conversation, **kwargs):

# Run RetrievalChatEvaluator and merge the results
if compute_rag_based_metrics:
retrieval_score = self._retrieval_chat_evaluator(conversation=conversation_slice)
retrieval_score = await self._retrieval_chat_evaluator(conversation=conversation_slice)
aggregated["gpt_retrieval"] = retrieval_score["gpt_retrieval"]
aggregated["evaluation_per_turn"]["gpt_retrieval"] = retrieval_score["evaluation_per_turn"]["gpt_retrieval"]
aggregated = dict(sorted(aggregated.items()))

return aggregated

def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
async def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
try:
question = questions[turn_num] if turn_num < len(questions) else ""
answer = answers[turn_num] if turn_num < len(answers) else ""
context = contexts[turn_num] if turn_num < len(contexts) else ""

score = evaluator(question=question, answer=answer, context=context)
score = await evaluator(question=question, answer=answer, context=context)

return score
except Exception as e: # pylint: disable=broad-exception-caught
@@ -287,3 +235,75 @@ def _validate_conversation(self, conversation: List[Dict]):
# Ensure the conversation ends with an assistant's turn
if expected_role != "user":
raise ValueError("The conversation must end with an assistant's turn.")


class ChatEvaluator:
"""
Initialize a chat evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
**Usage**
.. code-block:: python
chat_eval = ChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
}
]
result = chat_eval(conversation=conversation)
**Output format**
.. code-block:: python
{
"evaluation_per_turn": {
"gpt_retrieval": [1.0, 2.0],
"gpt_groundedness": [5.0, 2.0],
"gpt_relevance": [3.0, 5.0],
"gpt_coherence": [1.0, 2.0],
"gpt_fluency": [3.0, 5.0]
}
"gpt_retrieval": 1.5,
"gpt_groundedness": 3.5,
"gpt_relevance": 4.0,
"gpt_coherence": 1.5,
"gpt_fluency": 4.0
}
"""

def __init__(
self, model_config: AzureOpenAIModelConfiguration, eval_last_turn: bool = False, parallel: bool = True
):
self._async_evaluator = _AsyncChatEvaluator(model_config, eval_last_turn, parallel)

def __call__(self, *, conversation, **kwargs):
"""
Evaluates chat scenario.
:keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
"context" key is optional for assistant's turn and should have "citations" key with list of citations.
:paramtype conversation: List[Dict]
:return: The scores for Chat scenario.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs)

def _to_async(self):
return self._async_evaluator
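The shape above (a public evaluator whose synchronous `__call__` delegates to a private async implementation through `async_run_allowing_running_loop`, plus a `_to_async()` hook so composite evaluators can await the inner object directly) is the core pattern of this change. Below is a rough sketch of the same shape using only the standard library; the class names are illustrative, and `asyncio.run` merely stands in for promptflow's helper, which also tolerates an already-running event loop.

```python
import asyncio

class _AsyncExampleEvaluator:
    async def __call__(self, *, question: str, answer: str) -> dict:
        await asyncio.sleep(0)  # stand-in for an async model call
        return {"score": float(len(answer) > 0)}

class ExampleEvaluator:
    """Synchronous facade over an async implementation (illustrative only)."""

    def __init__(self):
        self._async_evaluator = _AsyncExampleEvaluator()

    def __call__(self, *, question: str, answer: str) -> dict:
        # promptflow uses async_run_allowing_running_loop here so the sync API also
        # works when an event loop is already running; asyncio.run is the plain analogue.
        return asyncio.run(self._async_evaluator(question=question, answer=answer))

    def _to_async(self):
        # Composite evaluators (like the chat evaluator above) grab the async object
        # directly so they can await it inside their own event loop.
        return self._async_evaluator

print(ExampleEvaluator()(question="What is 2 + 2?", answer="4"))
```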