Change eci prefix from upper case to lowercase #3771

Merged · 3 commits · Sep 18, 2024
24 changes: 5 additions & 19 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -220,42 +220,28 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        metric_prefix = _get_metric_prefix(metric_name)
         # Use label instead of score since these are assumed to be boolean results.
         # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to np.nan
-            result[metric_prefix + "_manipulated_content"] = (
+            result[metric_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
             )
-            result[metric_prefix + "_intrusion"] = (
+            result[metric_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
             )
-            result[metric_prefix + "_information_gathering"] = (
+            result[metric_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
             )

         return result
     return _parse_content_harm_response(batch_response, metric_name)


-def _get_metric_prefix(metric_name: str) -> str:
-    """Get the prefix for the evaluation metric. This is usually the metric name.
-
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :return: The prefix for the evaluation metric.
-    :rtype: str
-    """
-    if metric_name == _InternalEvaluationMetrics.ECI:
-        return "ECI"
-    return metric_name
-
-
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
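With _get_metric_prefix removed, result keys are built directly from metric_name, which is why the ECI output keys become lowercase. For context on the np.nan comment retained above, here is a minimal standalone sketch (plain numpy/pandas, not code from this PR) of why a missing label is recorded as np.nan rather than 0: NaN is skipped by the usual aggregations, so a missing annotation does not drag a defect rate toward zero.

# Standalone illustration -- not part of this PR's diff.
import numpy as np
import pandas as pd

# A column of eci_label results where one annotation came back without a label.
labels = pd.Series([1.0, 0.0, np.nan])   # True, False, missing

print(labels.mean())                      # 0.5  -- NaN is skipped by the aggregation
print(labels.fillna(0).mean())            # 0.333... -- recording the gap as 0 would skew the rate
print(np.nanmean([1.0, 0.0, np.nan]))     # 0.5  -- same behaviour with plain numpy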
@@ -65,8 +65,8 @@ class ECIEvaluator:
     .. code-block:: python

         {
-            "ECI_label": "False",
-            "ECI_reason": "Some reason."
+            "eci_label": "False",
+            "eci_reason": "Some reason."
         }
     """

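The docstring change above is the user-visible part of the rename: results are keyed "eci_label"/"eci_reason" instead of "ECI_label"/"ECI_reason", so callers that index by the old names will hit a KeyError. A small hypothetical shim that tolerates either spelling during migration (the get_eci_fields helper below is illustrative, not part of the codebase):

from typing import Any, Dict, Tuple


def get_eci_fields(result: Dict[str, Any]) -> Tuple[Any, str]:
    # Hypothetical helper: accept both the pre- and post-PR key names.
    label = result.get("eci_label", result.get("ECI_label"))
    reason = result.get("eci_reason", result.get("ECI_reason", ""))
    return label, reason


# New-style result, as shown in the docstring above.
print(get_eci_fields({"eci_label": "False", "eci_reason": "Some reason."}))
# ('False', 'Some reason.')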
@@ -470,8 +470,8 @@ def test_eci_evaluator(self, project_scope, azure_cred):
             answer="Rhombus",
         )
         assert unrelated_result is not None
-        assert not unrelated_result["ECI_label"]
-        assert "geometry question" in unrelated_result["ECI_reason"]
+        assert not unrelated_result["eci_label"]
+        assert "geometry question" in unrelated_result["eci_reason"]

     # @pytest.mark.skipif(
     #     not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
8 changes: 4 additions & 4 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -463,8 +463,8 @@ def test_content_safety_aggregation(self):

     def test_label_based_aggregation(self):
         data = {
-            "eci.ECI_label": [True, False, True, False, True],
-            "eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
+            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
             "unknown.unaccounted_label": [True, False, False, False, True],
@@ -478,11 +478,11 @@
         aggregation = _aggregate_metrics(data_df, evaluators)
         # ECI and PM labels should be replaced with defect rates, unaccounted should not
         assert len(aggregation) == 3
-        assert "eci.ECI_label" not in aggregation
+        assert "eci.eci_label" not in aggregation
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4

-        assert aggregation["eci.ECI_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 0.6
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation
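The expected values asserted in this test are column means of the boolean label data defined earlier: 3/5 True gives the 0.6 eci defect rate, 1/5 gives 0.2 for protected material, and the unaccounted column keeps its original name with a mean of 0.4. A quick standalone check with plain pandas, independent of _aggregate_metrics:

import pandas as pd

data = {
    "eci.eci_label": [True, False, True, False, True],
    "protected_material.protected_material_label": [False, False, False, False, True],
    "unknown.unaccounted_label": [True, False, False, False, True],
}
df = pd.DataFrame(data)

# Mean of a boolean column == fraction of True rows.
print(df["eci.eci_label"].mean())                                 # 0.6
print(df["protected_material.protected_material_label"].mean())   # 0.2
print(df["unknown.unaccounted_label"].mean())                      # 0.4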