Change eci prefix from upper case to lowercase (#3771)
# Description

This PR changes the ECI result-key prefix from uppercase (`ECI_`) to
lowercase (`eci_`) so that it matches the casing of the other evaluator
types and integrates better with UI evaluation flows.
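
For illustration, a minimal sketch of what the rename means for callers is shown below. Only the key names (`ECI_label`/`ECI_reason` becoming `eci_label`/`eci_reason`) come from this PR; the project-scope dictionary and constructor wiring are assumptions modeled on the e2e test further down.

```python
# Hypothetical usage sketch -- the project_scope shape and credential wiring
# are assumptions, not part of this PR; only the result key names are.
from azure.identity import DefaultAzureCredential
from promptflow.evals.evaluators._eci._eci import ECIEvaluator

project_scope = {  # assumed Azure AI project scope fields
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

eci_evaluator = ECIEvaluator(project_scope, credential=DefaultAzureCredential())
result = eci_evaluator(question="What shape has 4 equal sides?", answer="Rhombus")

# Before this PR the result keys were "ECI_label" / "ECI_reason";
# after this PR they are lowercase.
print(result["eci_label"], result["eci_reason"])
```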

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
diondrapeck authored Sep 18, 2024
1 parent e79da2e commit 3cde352
Showing 4 changed files with 13 additions and 27 deletions.
24 changes: 5 additions & 19 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
```diff
@@ -220,42 +220,28 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        metric_prefix = _get_metric_prefix(metric_name)
         # Use label instead of score since these are assumed to be boolean results.
         # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to np.nan
-            result[metric_prefix + "_manipulated_content"] = (
+            result[metric_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
             )
-            result[metric_prefix + "_intrusion"] = (
+            result[metric_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
             )
-            result[metric_prefix + "_information_gathering"] = (
+            result[metric_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
             )
 
         return result
     return _parse_content_harm_response(batch_response, metric_name)
 
 
-def _get_metric_prefix(metric_name: str) -> str:
-    """Get the prefix for the evaluation metric. This is usually the metric name.
-
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :return: The prefix for the evaluation metric.
-    :rtype: str
-    """
-    if metric_name == _InternalEvaluationMetrics.ECI:
-        return "ECI"
-    return metric_name
-
-
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
```
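
The inline comment about `np.nan` carries the main design point of this block: NaN values are skipped by the downstream aggregations, so a missing label simply drops out of the mean instead of being counted as 0. A standalone illustration (not promptflow code):

```python
import numpy as np
import pandas as pd

# NaN entries are ignored by the aggregation rather than treated as 0.
labels = pd.Series([1.0, np.nan, 0.0, 1.0])
print(labels.mean())       # 0.666... -- averaged over the three non-NaN values
print(np.nanmean(labels))  # same idea with a plain NumPy aggregation
```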
4 changes: 2 additions & 2 deletions src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
```diff
@@ -65,8 +65,8 @@ class ECIEvaluator:
 
     .. code-block:: python
 
        {
-            "ECI_label": "False",
-            "ECI_reason": "Some reason."
+            "eci_label": "False",
+            "eci_reason": "Some reason."
        }
    """
```
```diff
@@ -470,8 +470,8 @@ def test_eci_evaluator(self, project_scope, azure_cred):
             answer="Rhombus",
         )
         assert unrelated_result is not None
-        assert not unrelated_result["ECI_label"]
-        assert "geometry question" in unrelated_result["ECI_reason"]
+        assert not unrelated_result["eci_label"]
+        assert "geometry question" in unrelated_result["eci_reason"]
 
     # @pytest.mark.skipif(
     #     not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
```
8 changes: 4 additions & 4 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
```diff
@@ -463,8 +463,8 @@ def test_content_safety_aggregation(self):
 
     def test_label_based_aggregation(self):
         data = {
-            "eci.ECI_label": [True, False, True, False, True],
-            "eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
+            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
             "unknown.unaccounted_label": [True, False, False, False, True],
@@ -478,11 +478,11 @@ def test_label_based_aggregation(self):
         aggregation = _aggregate_metrics(data_df, evaluators)
         # ECI and PM labels should be replaced with defect rates, unaccounted should not
         assert len(aggregation) == 3
-        assert "eci.ECI_label" not in aggregation
+        assert "eci.eci_label" not in aggregation
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4
 
-        assert aggregation["eci.ECI_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 0.6
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation
```
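
The expected values in this test follow directly from the label columns: a defect rate here is just the fraction of `True` labels in the column (and the unaccounted column keeps its plain mean, 2/5 = 0.4). A quick standalone check of the arithmetic, independent of promptflow internals:

```python
eci_labels = [True, False, True, False, True]
pm_labels = [False, False, False, False, True]


def defect_rate(labels):
    """Fraction of True labels in the column."""
    return sum(labels) / len(labels)


print(defect_rate(eci_labels))  # 0.6
print(defect_rate(pm_labels))   # 0.2
```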
