[promptflow-evals] evaluator config support (#2963)
# Description

Here is an example to explain how evaluator config works in the evaluate
API:

Data: col1, col2
Target: col3

Mapping:
   question -> ${data.col1} 
   answer -> ${target.col3}


Evaluate API Workflow

1. Update evaluator config
    - Replace all "${target." with "${data."
    - New mapping:
        - question -> ${data.col1}
        - answer -> ${data.col3}

2. Apply target to data
    - New data columns: col1, col2, col3

3. Column validation
    - For each evaluator, rename the data columns according to its mapping:
        - col1 -> question, col3 -> answer
        - Resulting columns: question, col2, answer
    - Compare the renamed columns against the evaluator signature

4. Call each evaluator with the mapped columns (see the sketch below)

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Apr 26, 2024
1 parent e756829 commit 89c5bbb
Showing 14 changed files with 407 additions and 384 deletions.
137 changes: 93 additions & 44 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -3,19 +3,15 @@
# ---------------------------------------------------------
import inspect
import os
import re
import tempfile
import uuid

from types import FunctionType
from typing import Any, Callable, Dict, Optional, Set, Tuple

import pandas as pd

from promptflow.client import PFClient

from ._code_client import CodeClient

from promptflow._sdk._constants import LINE_NUMBER
from promptflow.client import PFClient


def _calculate_mean(df) -> Dict[str, float]:
@@ -70,14 +66,17 @@ def _validate_and_load_data(target, data, evaluators, output_path, tracking_uri,
try:
initial_data_df = pd.read_json(data, lines=True)
except Exception as e:
raise ValueError(
f"Failed to load data from {data}. Please validate it is a valid jsonl data. Error: {str(e)}.")
raise ValueError(f"Failed to load data from {data}. Please validate it is a valid jsonl data. Error: {str(e)}.")

_validate_columns(initial_data_df, evaluators, target)
return initial_data_df


def _validate_columns(df: pd.DataFrame, evaluators: Dict[str, Any], target: Optional[Callable]) -> None:
def _validate_columns(
df: pd.DataFrame,
evaluators: Dict[str, Any],
target: Optional[Callable],
evaluator_config: Dict[str, Dict[str, str]],
) -> None:
"""
Check that all columns needed by evaluator or target function are present.
@@ -96,14 +95,17 @@ def _validate_columns(df: pd.DataFrame, evaluators: Dict[str, Any], target: Opti
_validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
else:
for evaluator_name, evaluator in evaluators.items():
_validate_input_data_for_evaluator(evaluator, evaluator_name, df)
# Apply column mapping
mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
new_df = _apply_column_mapping(df, mapping_config)

# Validate input data for evaluator
_validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)


def _apply_target_to_data(
target: Callable,
data: str,
pf_client: PFClient,
initial_data: pd.DataFrame) -> Tuple[pd.DataFrame, Set[str]]:
target: Callable, data: str, pf_client: PFClient, initial_data: pd.DataFrame
) -> Tuple[pd.DataFrame, Set[str]]:
"""
Apply the target function to the data set and return updated data and generated columns.
@@ -121,18 +123,13 @@ def _apply_target_to_data(
# We are manually creating the temporary directory for the flow
# because the way tempdir remove temporary directories will
# hang the debugger, because promptflow will keep flow directory.
run = pf_client.run(
flow=target,
data=data,
name=f'preprocess_{uuid.uuid1()}',
stream=True
)
run = pf_client.run(flow=target, data=data, name=f"preprocess_{uuid.uuid1()}", stream=True)
target_output = pf_client.runs.get_details(run, all_results=True)
# Remove input and output prefix
prefix = 'outputs.'
rename_dict = {col: col[len(prefix):] for col in target_output.columns if col.startswith(prefix)}
prefix = "outputs."
rename_dict = {col: col[len(prefix) :] for col in target_output.columns if col.startswith(prefix)}
# Sort output by line numbers
target_output.set_index(f'inputs.{LINE_NUMBER}', inplace=True)
target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
target_output.sort_index(inplace=True)
target_output.reset_index(inplace=True, drop=False)
# target_output contains only input columns, taken by function,
@@ -146,6 +143,57 @@ def _apply_target_to_data(
return target_output, set(rename_dict.values())


def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
"""
Apply column mapping to source_df based on mapping_config.
This function is used for pre-validation of input data for evaluators
"""
result_df = source_df

if mapping_config:
column_mapping = {}
pattern_prefix = "data."

for map_to_key, map_value in mapping_config.items():
match = re.search(r"^\${([^{}]+)}$", map_value)
if match is not None:
pattern = match.group(1)
if pattern.startswith(pattern_prefix):
map_from_key = pattern.split(pattern_prefix)[1]
column_mapping[map_from_key] = map_to_key

result_df = source_df.rename(columns=column_mapping, inplace=inplace)

return result_df


def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]):
"""Process evaluator_config to replace ${target.} with ${data.}"""

processed_config = {}

unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

if evaluator_config:
for evaluator, mapping_config in evaluator_config.items():
if isinstance(mapping_config, dict):
processed_config[evaluator] = {}

for map_to_key, map_value in mapping_config.items():

# Check if there's any unexpected reference other than ${target.} or ${data.}
if unexpected_references.search(map_value):
raise ValueError(
"Unexpected references detected in 'evaluator_config'. "
"Ensure only ${target.} and ${data.} are used."
)

# Replace ${target.} with ${data.}
processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${data.")

return processed_config


def evaluate(
*,
evaluation_name: Optional[str] = None,
@@ -176,34 +224,32 @@ def evaluate(
:rtype: ~azure.ai.generative.evaluate.EvaluationResult
"""

input_data_df = _validate_and_load_data(
target, data, evaluators, output_path, tracking_uri, evaluation_name)
input_data_df = _validate_and_load_data(target, data, evaluators, output_path, tracking_uri, evaluation_name)

# Process evaluator config to replace ${target.} with ${data.}
evaluator_config = _process_evaluator_config(evaluator_config)
_validate_columns(input_data_df, evaluators, target, evaluator_config)

pf_client = PFClient()
code_client = CodeClient()

target_generated_columns = set()
if data is not None and target is not None:
input_data_df, target_generated_columns = _apply_target_to_data(
target, data, pf_client, input_data_df)
input_data_df, target_generated_columns = _apply_target_to_data(target, data, pf_client, input_data_df)
# After we have generated all columns we can check if we have
# everything we need for evaluators.
_validate_columns(input_data_df, evaluators, None)
_validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)

evaluator_info = {}

with tempfile.TemporaryDirectory() as d:
data_file = data
if target_generated_columns:
data_file = os.path.join(d, 'input.jsonl')
input_data_df.to_json(data_file, orient='records', lines=True)
for evaluator_name, evaluator in evaluators.items():
if isinstance(evaluator, FunctionType):
evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}})
else:
evaluator_info.update({evaluator_name: {"client": code_client, "evaluator": evaluator}})
data_file = os.path.join(d, "input.jsonl")
input_data_df.to_json(data_file, orient="records", lines=True)

evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run(
for evaluator_name, evaluator in evaluators.items():
evaluator_info[evaluator_name] = {}
evaluator_info[evaluator_name]["run"] = pf_client.run(
flow=evaluator,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data_file,
@@ -212,7 +258,7 @@ def evaluate(

evaluators_result_df = None
for evaluator_name, evaluator_info in evaluator_info.items():
evaluator_result_df = evaluator_info["client"].get_details(evaluator_info["run"], all_results=True)
evaluator_result_df = pf_client.get_details(evaluator_info["run"], all_results=True)

# drop input columns
evaluator_result_df = evaluator_result_df.drop(
@@ -223,8 +269,8 @@ def evaluate(
# Assuming after removing inputs columns, all columns are output columns
evaluator_result_df.rename(
columns={
col: "outputs."
f"{evaluator_name}.{col.replace('outputs.', '')}" for col in evaluator_result_df.columns
col: "outputs." f"{evaluator_name}.{col.replace('outputs.', '')}"
for col in evaluator_result_df.columns
},
inplace=True,
)
@@ -236,9 +282,12 @@ def evaluate(
)

# Rename columns, generated by template function to outputs instead of inputs.
input_data_df.rename(columns={
col: f"{'outputs' if col in target_generated_columns else 'inputs'}.{col}" for col in input_data_df.columns},
inplace=True)
input_data_df.rename(
columns={
col: f"{'outputs' if col in target_generated_columns else 'inputs'}.{col}" for col in input_data_df.columns
},
inplace=True,
)

result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)

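A quick sketch (not part of the change) of how the two new helpers in this file behave, assuming they remain importable as private functions from `promptflow.evals.evaluate._evaluate`:

```python
# Sketch: exercising the new private helpers added in this diff.
import pandas as pd

from promptflow.evals.evaluate._evaluate import _apply_column_mapping, _process_evaluator_config

evaluator_config = {"default": {"question": "${data.col1}", "answer": "${target.col3}"}}

# ${target.} references are rewritten to ${data.} before validation.
processed = _process_evaluator_config(evaluator_config)
# -> {"default": {"question": "${data.col1}", "answer": "${data.col3}"}}

# Columns are renamed per the mapping so they line up with the evaluator signature.
df = pd.DataFrame({"col1": ["How do you create a run?"], "col2": ["extra"], "col3": ["Use pf.run(...)"]})
mapped = _apply_column_mapping(df, processed["default"])
print(list(mapped.columns))  # ['question', 'col2', 'answer']
```
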
6 changes: 3 additions & 3 deletions src/promptflow-evals/samples/evaluate_test_data.jsonl
@@ -1,3 +1,3 @@
{"question":"How do you create a run?","context":"AML API only","answer":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment."}
{"question":"How do you log a model?","context":"Logging can be done using any OSS Sdk","answer":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`."}
{"question":"What is the capital of France?","context":"France is in Europe","answer":"Paris is the capital of France."}
{"question": "What is the capital of France?", "context": "France is in Europe", "answer": "Paris is the capital of France.", "ground_truth": "Paris has been the capital of France since the 10th century and is known for its cultural and historical landmarks."}
{"question": "Who developed the theory of relativity?", "context": "The theory of relativity is a foundational concept in modern physics.", "answer": "Albert Einstein developed the theory of relativity.", "ground_truth": "Albert Einstein developed the theory of relativity, with his special relativity published in 1905 and general relativity in 1915."}
{"question": "What is the speed of light?", "context": "Light travels at a constant speed in a vacuum.", "answer": "The speed of light is approximately 299,792,458 meters per second.", "ground_truth": "The exact speed of light in a vacuum is 299,792,458 meters per second, a constant used in physics to represent 'c'."}