From 3cde352085ff6905c5314b433a84f49fdaa9b0f9 Mon Sep 17 00:00:00 2001
From: Diondra <16376603+diondrapeck@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:22:29 -0400
Subject: [PATCH] Change eci prefix from upper case to lowercase (#3771)

# Description

This PR changes the ECI result-key prefix from upper case to lower case (e.g. "ECI_label" becomes "eci_label") so that it matches the casing of the other evaluator types and integrates better with UI evaluation flows.

# All Promptflow Contribution checklist:

- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.**
- [ ] **I have read the [contribution guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT license.**
- [ ] **Create an issue and link to the pull request to get dedicated review from the promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices

- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines

- [ ] Pull request includes test coverage for the included changes.

---
 .../promptflow/evals/_common/rai_service.py   | 24 ++++---------------
 .../promptflow/evals/evaluators/_eci/_eci.py  |  4 ++--
 .../evals/e2etests/test_builtin_evaluators.py |  4 ++--
 .../tests/evals/unittests/test_evaluate.py    |  8 +++----
 4 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/src/promptflow-evals/promptflow/evals/_common/rai_service.py b/src/promptflow-evals/promptflow/evals/_common/rai_service.py
index 84ad90acffa..6674206acdd 100644
--- a/src/promptflow-evals/promptflow/evals/_common/rai_service.py
+++ b/src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -220,22 +220,21 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     response = response.replace("true", "True")
     parsed_response = literal_eval(response)
     result = {}
-    metric_prefix = _get_metric_prefix(metric_name)
     # Use label instead of score since these are assumed to be boolean results.
     # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-    result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-    result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+    result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+    result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
     if metric_name == EvaluationMetrics.XPIA:
         # Add "manipulated_content", "intrusion" and "information_gathering" to the result
         # if present else set them to np.nan
-        result[metric_prefix + "_manipulated_content"] = (
+        result[metric_name + "_manipulated_content"] = (
             parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
         )
-        result[metric_prefix + "_intrusion"] = (
+        result[metric_name + "_intrusion"] = (
             parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
         )
-        result[metric_prefix + "_information_gathering"] = (
+        result[metric_name + "_information_gathering"] = (
             parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
         )
@@ -243,19 +242,6 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     return _parse_content_harm_response(batch_response, metric_name)
 
 
-def _get_metric_prefix(metric_name: str) -> str:
-    """Get the prefix for the evaluation metric. This is usually the metric name.
-
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :return: The prefix for the evaluation metric.
-    :rtype: str
-    """
-    if metric_name == _InternalEvaluationMetrics.ECI:
-        return "ECI"
-    return metric_name
-
-
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py b/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
index 2305c20c72c..428434693e0 100644
--- a/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
+++ b/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -65,8 +65,8 @@ class ECIEvaluator:
     .. code-block:: python
 
         {
-            "ECI_label": "False",
-            "ECI_reason": "Some reason."
+            "eci_label": "False",
+            "eci_reason": "Some reason."
         }
     """
 
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
index acbf69651c3..de56db9049b 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
@@ -470,8 +470,8 @@ def test_eci_evaluator(self, project_scope, azure_cred):
             answer="Rhombus",
         )
         assert unrelated_result is not None
-        assert not unrelated_result["ECI_label"]
-        assert "geometry question" in unrelated_result["ECI_reason"]
+        assert not unrelated_result["eci_label"]
+        assert "geometry question" in unrelated_result["eci_reason"]
 
     # @pytest.mark.skipif(
    #     not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
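
Reviewer note: to sanity-check the new key construction without the Responsible AI service in the loop, here is a minimal, self-contained sketch of what `parse_response` now does for the label/reason pair. The payload shape mirrors the hunks above, but the `parse_annotation` helper itself is hypothetical and exists only for illustration; it is not part of this patch.

```python
import math

import numpy as np


def parse_annotation(metric_name: str, parsed_response: dict) -> dict:
    """Simplified stand-in for parse_response after this patch: result keys
    are built directly from metric_name, with no upper-case ECI special case."""
    result = {}
    # Use label instead of score since these are assumed to be boolean results.
    # Use np.nan as the null value so aggregations ignore it rather than count it as 0.
    result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
    result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
    return result


# _get_metric_prefix previously mapped the ECI metric to the upper-case "ECI"
# prefix, yielding "ECI_label"/"ECI_reason"; with this patch the keys stay
# lower case like every other evaluator's.
result = parse_annotation("eci", {"label": False, "reasoning": "Some reason."})
assert set(result) == {"eci_label", "eci_reason"}

# A missing label falls back to np.nan, which downstream aggregations skip.
assert math.isnan(parse_annotation("eci", {})["eci_label"])
```

Deriving the keys from `metric_name` alone removes the last special case, so every evaluator's result keys follow the same lower-case convention the UI evaluation flows expect.
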
diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
index 6627e4db156..8f5d38db1d3 100644
--- a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -463,8 +463,8 @@ def test_content_safety_aggregation(self):
 
     def test_label_based_aggregation(self):
         data = {
-            "eci.ECI_label": [True, False, True, False, True],
-            "eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
+            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
             "unknown.unaccounted_label": [True, False, False, False, True],
@@ -478,11 +478,11 @@ def test_label_based_aggregation(self):
         aggregation = _aggregate_metrics(data_df, evaluators)
         # ECI and PM labels should be replaced with defect rates, unaccounted should not
         assert len(aggregation) == 3
-        assert "eci.ECI_label" not in aggregation
+        assert "eci.eci_label" not in aggregation
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4
 
-        assert aggregation["eci.ECI_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 0.6
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation
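
Reviewer note: as a cross-check on the expected values in `test_label_based_aggregation`, the defect rate is just the mean of a boolean label column, so three `True` labels out of five give 0.6. Below is a minimal sketch of that idea against the same data, assuming pandas is available; `label_to_defect_rate` is a hypothetical helper for illustration, not the actual `_aggregate_metrics` implementation.

```python
from typing import Tuple

import pandas as pd


def label_to_defect_rate(df: pd.DataFrame, column: str) -> Tuple[str, float]:
    """Hypothetical helper mirroring the aggregation the test asserts:
    a boolean *_label column is replaced by a *_defect_rate equal to
    the fraction of True values."""
    rate_name = column.replace("_label", "_defect_rate")
    return rate_name, float(df[column].mean())


df = pd.DataFrame({"eci.eci_label": [True, False, True, False, True]})
name, rate = label_to_defect_rate(df, "eci.eci_label")
assert name == "eci.eci_defect_rate"  # the lower-case "eci" flows through to the aggregate name
assert rate == 0.6  # 3 of the 5 labels are True
```

This is also why the rename matters for aggregation: the defect-rate name is derived from the label column name, so the old upper-case `ECI_label` produced an `ECI_defect_rate` that did not match the casing of the other evaluators' aggregates.
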