diff --git a/src/promptflow-evals/CHANGELOG.md b/src/promptflow-evals/CHANGELOG.md
index 3be508e2bd6..997080a7e10 100644
--- a/src/promptflow-evals/CHANGELOG.md
+++ b/src/promptflow-evals/CHANGELOG.md
@@ -8,6 +8,7 @@
 
 ### Bugs Fixed
 - Large simulation was causing a jinja exception, this has been fixed.
+- Fixed an issue where a relative data path was not working with the evaluate API when using multiple evaluators.
 
 ### Improvements
 - Converted built-in evaluators to async-based implementation, leveraging async batch run for performance improvement. Introduced `PF_EVALS_BATCH_USE_ASYNC` environment variable to enable/disable async batch run, with the default set to False.
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
index 8f2b979afb9..f6f40c73376 100644
--- a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
+++ b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import os
 import re
 from typing import Any, Callable, Dict, Optional, Set, Tuple
 
@@ -442,7 +443,15 @@ def _evaluate(  # pylint: disable=too-many-locals
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
-    batch_run_client = ProxyClient(pf_client) if use_pf_client else CodeClient()
+    if use_pf_client:
+        batch_run_client = ProxyClient(pf_client)
+
+        # Ensure an absolute path is passed to pf.run, as a relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath returns it unchanged.
+        data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        data = input_data_df
 
     with BatchRunContext(batch_run_client):
         for evaluator_name, evaluator in evaluators.items():
@@ -452,7 +461,7 @@ def _evaluate(  # pylint: disable=too-many-locals
                 run=target_run,
                 evaluator_name=evaluator_name,
                 column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
-                data=input_data_df if isinstance(batch_run_client, CodeClient) else data,
+                data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
             )
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
index 38dde389654..8e9b6f1fddb 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -125,6 +125,41 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         assert row_result_df["outputs.f1_score.f1_score"][2] == 1
         assert result["studio_url"] is None
 
+    def test_evaluate_with_relative_data_path(self, model_config):
+        original_working_dir = os.getcwd()
+
+        try:
+            working_dir = os.path.dirname(__file__)
+            os.chdir(working_dir)
+
+            data_file = "data/evaluate_test_data.jsonl"
+            input_data = pd.read_json(data_file, lines=True)
+
+            groundedness_eval = GroundednessEvaluator(model_config)
+            fluency_eval = FluencyEvaluator(model_config)
+
+            # Run the evaluation
+            result = evaluate(
+                data=data_file,
+                evaluators={"grounded": groundedness_eval, "fluency": fluency_eval},
+            )
+
+            row_result_df = pd.DataFrame(result["rows"])
+            metrics = result["metrics"]
+
+            # Validate the results
+            assert result is not None
+            assert result["rows"] is not None
+            assert row_result_df.shape[0] == len(input_data)
+
+            assert "outputs.grounded.gpt_groundedness" in row_result_df.columns.to_list()
+            assert "outputs.fluency.gpt_fluency" in row_result_df.columns.to_list()
+
+            assert "grounded.gpt_groundedness" in metrics.keys()
+            assert "fluency.gpt_fluency" in metrics.keys()
+        finally:
+            os.chdir(original_working_dir)
+
     @pytest.mark.azuretest
     def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):
         input_data = pd.read_json(data_file, lines=True)