diff --git a/src/promptflow-evals/promptflow/evals/_common/rai_service.py b/src/promptflow-evals/promptflow/evals/_common/rai_service.py
index 84ad90acffa..6674206acdd 100644
--- a/src/promptflow-evals/promptflow/evals/_common/rai_service.py
+++ b/src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -220,22 +220,21 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        metric_prefix = _get_metric_prefix(metric_name)
         # Use label instead of score since these are assumed to be boolean results.
         # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to np.nan
-            result[metric_prefix + "_manipulated_content"] = (
+            result[metric_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
             )
-            result[metric_prefix + "_intrusion"] = (
+            result[metric_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
             )
-            result[metric_prefix + "_information_gathering"] = (
+            result[metric_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
             )
@@ -243,19 +242,6 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     return _parse_content_harm_response(batch_response, metric_name)


-def _get_metric_prefix(metric_name: str) -> str:
-    """Get the prefix for the evaluation metric. This is usually the metric name.
-
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :return: The prefix for the evaluation metric.
-    :rtype: str
-    """
-    if metric_name == _InternalEvaluationMetrics.ECI:
-        return "ECI"
-    return metric_name
-
-
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py b/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
index 2305c20c72c..428434693e0 100644
--- a/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
+++ b/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -65,8 +65,8 @@ class ECIEvaluator:
     .. code-block:: python

         {
-            "ECI_label": "False",
-            "ECI_reason": "Some reason."
+            "eci_label": "False",
+            "eci_reason": "Some reason."
         }
     """
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
index acbf69651c3..de56db9049b 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
@@ -470,8 +470,8 @@ def test_eci_evaluator(self, project_scope, azure_cred):
             answer="Rhombus",
         )
         assert unrelated_result is not None
-        assert not unrelated_result["ECI_label"]
-        assert "geometry question" in unrelated_result["ECI_reason"]
+        assert not unrelated_result["eci_label"]
+        assert "geometry question" in unrelated_result["eci_reason"]

     # @pytest.mark.skipif(
     #     not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
index 6627e4db156..8f5d38db1d3 100644
--- a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -463,8 +463,8 @@ def test_content_safety_aggregation(self):

     def test_label_based_aggregation(self):
         data = {
-            "eci.ECI_label": [True, False, True, False, True],
-            "eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
+            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
             "unknown.unaccounted_label": [True, False, False, False, True],
@@ -478,11 +478,11 @@ def test_label_based_aggregation(self):
         aggregation = _aggregate_metrics(data_df, evaluators)

         # ECI and PM labels should be replaced with defect rates, unaccounted should not
         assert len(aggregation) == 3
-        assert "eci.ECI_label" not in aggregation
+        assert "eci.eci_label" not in aggregation
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4

-        assert aggregation["eci.ECI_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 0.6
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation