diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 312026361a5d4..4aebcfee89d68 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -11,6 +11,7 @@
 - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
 - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+- Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the `_parallel` boolean keyword argument during class initialization.
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json
index 1e52dc68cc084..1402fc658ba6b 100644
--- a/sdk/evaluation/azure-ai-evaluation/assets.json
+++ b/sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
+  "Tag": "python/evaluation/azure-ai-evaluation_acededcaea"
 }
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
index 12dbac53101cb..80b830e4c3621 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -69,7 +69,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     # TODO address 3579092 to re-enabled parallel evals.
     def __init__(self, credential, azure_ai_project, **kwargs):
         super().__init__()
-        self._parallel = kwargs.pop("_parallel", False)
+        self._parallel = kwargs.pop("_parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -152,7 +152,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
             with ThreadPoolExecutor() as executor:
                 # pylint: disable=no-value-for-parameter
                 futures = {
-                    executor.submit(query=query, response=response, conversation=conversation): evaluator
+                    executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
                     for evaluator in self._evaluators
                 }
 
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py
index 843963493ce98..8b35a0113961a 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py
@@ -318,26 +318,6 @@ def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cre
             score2["evaluation_per_turn"]["hate_unfairness_reason"]
         ), "hate_unfairness_reason must not be None or empty."
 
-    # TODO remove?
-
-    @pytest.mark.skipif(True, reason="Content safety service IS available now!")
-    def test_content_safety_service_unavailable(self, project_scope, azure_cred):
-        eval_fn = ViolenceEvaluator(azure_cred, project_scope)
-        # Doing this is replay mode breaks causes mismatch between scrubbed recordings
-        # and the actual request made.
-        if is_live():
-            # Warning, live testing fails due to unstable region.
-            # We need a use a new region.
-            project_scope["project_name"] = "pf-evals-ws-westus2"
-
-        with pytest.raises(Exception) as exc_info:
-            score = eval_fn(
-                query="What is the capital of Japan?",
-                response="The capital of Japan is Tokyo.",
-            )
-
-        assert "RAI service is not available in this region" in exc_info._excinfo[1].args[0]
-
     @pytest.mark.parametrize("parallel", [False, True])
     def test_composite_evaluator_qa(self, model_config, parallel):
         qa_eval = QAEvaluator(model_config, _parallel=parallel)
@@ -387,8 +367,9 @@ def test_composite_evaluator_qa_for_nans(self, model_config):
         assert not math.isnan(score["fluency"])
         assert not math.isnan(score["similarity"])
 
-    def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
-        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=False)
+    @pytest.mark.parametrize("parallel", [True, False])
+    def test_composite_evaluator_content_safety(self, project_scope, azure_cred, parallel):
+        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=parallel)
         score = safety_eval(
             query="Tokyo is the capital of which country?",
             response="Japan",
@@ -408,8 +389,11 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."
 
-    def test_composite_evaluator_content_safety_with_conversation(self, project_scope, azure_cred, simple_conversation):
-        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
+    @pytest.mark.parametrize("parallel", [True, False])
+    def test_composite_evaluator_content_safety_with_conversation(
+        self, project_scope, azure_cred, simple_conversation, parallel
+    ):
+        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=parallel)
         score = safety_eval(
             conversation=simple_conversation,
         )
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py
index 625cf021e9d3b..ffa4bbf981254 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py
@@ -192,7 +192,8 @@ def test_evaluate_with_relative_data_path(self, model_config):
         finally:
             os.chdir(original_working_dir)
 
-    def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file):
+    @pytest.mark.parametrize("parallel", [True, False])
+    def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file, parallel):
         input_data = pd.read_json(data_file, lines=True)
 
         # CS evaluator tries to store the credential, which breaks multiprocessing at
@@ -200,7 +201,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred,
         # generate a default credential at runtime.
         # Internal Parallelism is also disabled to avoid faulty recordings.
        content_safety_eval = ContentSafetyEvaluator(
-            credential=azure_cred, azure_ai_project=project_scope, _parallel=False
+            credential=azure_cred, azure_ai_project=project_scope, _parallel=parallel
        )
 
        # run the evaluation
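For reviewers, here is a minimal standalone sketch of the ThreadPoolExecutor fan-out pattern that the corrected `executor.submit(evaluator, ...)` call in `_do_eval` relies on. The `violence_eval`/`sexual_eval` stand-ins and the `run_parallel` helper are hypothetical and not part of the SDK; the only point being illustrated is that `submit` takes the callable as its first positional argument, which is why the previous keyword-only call failed.

# Minimal sketch (not the SDK's actual code) of fanning sub-evaluators out over a thread pool.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Dict, List


def violence_eval(query: str, response: str) -> Dict[str, float]:
    # Hypothetical stand-in for a sub-evaluator such as ViolenceEvaluator.
    return {"violence_score": 0.0}


def sexual_eval(query: str, response: str) -> Dict[str, float]:
    # Hypothetical stand-in for a sub-evaluator such as SexualEvaluator.
    return {"sexual_score": 0.0}


def run_parallel(
    evaluators: List[Callable[..., Dict[str, float]]], query: str, response: str
) -> Dict[str, float]:
    results: Dict[str, float] = {}
    with ThreadPoolExecutor() as executor:
        # Passing `evaluator` as the first positional argument is what the fix restores;
        # executor.submit(query=..., response=...) alone raises a TypeError because no
        # callable is supplied, which is what broke parallel execution.
        futures = {
            executor.submit(evaluator, query=query, response=response): evaluator
            for evaluator in evaluators
        }
        for future in as_completed(futures):
            results.update(future.result())
    return results


if __name__ == "__main__":
    print(run_parallel([violence_eval, sexual_eval], query="Hi", response="Hello"))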