Change eci prefix from upper case to lowercase #3771

Merged · 3 commits · Sep 18, 2024
24 changes: 5 additions & 19 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -220,42 +220,28 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        metric_prefix = _get_metric_prefix(metric_name)
         # Use label instead of score since these are assumed to be boolean results.
         # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to np.nan
-            result[metric_prefix + "_manipulated_content"] = (
+            result[metric_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
             )
-            result[metric_prefix + "_intrusion"] = (
+            result[metric_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
             )
-            result[metric_prefix + "_information_gathering"] = (
+            result[metric_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
             )

         return result
     return _parse_content_harm_response(batch_response, metric_name)


-def _get_metric_prefix(metric_name: str) -> str:
-    """Get the prefix for the evaluation metric. This is usually the metric name.
-
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :return: The prefix for the evaluation metric.
-    :rtype: str
-    """
-    if metric_name == _InternalEvaluationMetrics.ECI:
-        return "ECI"
-    return metric_name
-
-
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
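With _get_metric_prefix removed, result keys are built directly from metric_name, which is why the ECI output keys become lowercase. For context on the np.nan comment retained above, here is a minimal standalone sketch (plain numpy/pandas, not code from this PR) of why a missing label is recorded as np.nan rather than 0: NaN is skipped by the usual aggregations, so a missing annotation does not drag a defect rate toward zero.

# Standalone illustration -- not part of this PR's diff.
import numpy as np
import pandas as pd

# A column of eci_label results where one annotation came back without a label.
labels = pd.Series([1.0, 0.0, np.nan])   # True, False, missing

print(labels.mean())                      # 0.5  -- NaN is skipped by the aggregation
print(labels.fillna(0).mean())            # 0.333... -- recording the gap as 0 would skew the rate
print(np.nanmean([1.0, 0.0, np.nan]))     # 0.5  -- same behaviour with plain numpy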
@@ -65,8 +65,8 @@ class ECIEvaluator:
     .. code-block:: python

         {
-            "ECI_label": "False",
-            "ECI_reason": "Some reason."
+            "eci_label": "False",
+            "eci_reason": "Some reason."
         }
     """

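The docstring change above is the user-visible part of the rename: results are keyed "eci_label"/"eci_reason" instead of "ECI_label"/"ECI_reason", so callers that index by the old names will hit a KeyError. A small hypothetical shim that tolerates either spelling during migration (the get_eci_fields helper below is illustrative, not part of the codebase):

from typing import Any, Dict, Tuple


def get_eci_fields(result: Dict[str, Any]) -> Tuple[Any, str]:
    # Hypothetical helper: accept both the pre- and post-PR key names.
    label = result.get("eci_label", result.get("ECI_label"))
    reason = result.get("eci_reason", result.get("ECI_reason", ""))
    return label, reason


# New-style result, as shown in the docstring above.
print(get_eci_fields({"eci_label": "False", "eci_reason": "Some reason."}))
# ('False', 'Some reason.')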
@@ -470,8 +470,8 @@ def test_eci_evaluator(self, project_scope, azure_cred):
             answer="Rhombus",
         )
         assert unrelated_result is not None
-        assert not unrelated_result["ECI_label"]
-        assert "geometry question" in unrelated_result["ECI_reason"]
+        assert not unrelated_result["eci_label"]
+        assert "geometry question" in unrelated_result["eci_reason"]

     # @pytest.mark.skipif(
     #     not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
8 changes: 4 additions & 4 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -463,8 +463,8 @@ def test_content_safety_aggregation(self):

     def test_label_based_aggregation(self):
         data = {
-            "eci.ECI_label": [True, False, True, False, True],
-            "eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
+            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
             "unknown.unaccounted_label": [True, False, False, False, True],
@@ -478,11 +478,11 @@
         aggregation = _aggregate_metrics(data_df, evaluators)
         # ECI and PM labels should be replaced with defect rates, unaccounted should not
         assert len(aggregation) == 3
-        assert "eci.ECI_label" not in aggregation
+        assert "eci.eci_label" not in aggregation
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4

-        assert aggregation["eci.ECI_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 0.6
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation
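The expected values asserted in this test are column means of the boolean label data defined earlier: 3/5 True gives the 0.6 eci defect rate, 1/5 gives 0.2 for protected material, and the unaccounted column keeps its original name with a mean of 0.4. A quick standalone check with plain pandas, independent of _aggregate_metrics:

import pandas as pd

data = {
    "eci.eci_label": [True, False, True, False, True],
    "protected_material.protected_material_label": [False, False, False, False, True],
    "unknown.unaccounted_label": [True, False, False, False, True],
}
df = pd.DataFrame(data)

# Mean of a boolean column == fraction of True rows.
print(df["eci.eci_label"].mean())                                 # 0.6
print(df["protected_material.protected_material_label"].mean())   # 0.2
print(df["unknown.unaccounted_label"].mean())                      # 0.4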