Eval/bugfix/content safety parallel (Azure#38307)
* fix cs eval

* recordings and cl
MilesHolland authored and allenkim0129 committed Nov 5, 2024
1 parent d9d8ca8 commit 3560c15
Showing 5 changed files with 15 additions and 29 deletions.
1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -11,6 +11,7 @@
- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
- Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+ - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the `_parallel` boolean keyword argument during class initialization.

### Other Changes
- Refined error messages for serviced-based evaluators and simulators.
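As context for the changelog entry above, here is a minimal usage sketch of the `_parallel` flag. The `azure_ai_project` values and the credential setup are placeholders for illustration only and are not part of this commit.

# Minimal sketch: disabling parallel sub-evaluator execution via _parallel.
# The project values below are placeholders, not real identifiers.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# Parallel execution of the sub-evaluators is the default again;
# pass _parallel=False to run them sequentially instead.
safety_eval = ContentSafetyEvaluator(
    DefaultAzureCredential(), azure_ai_project, _parallel=False
)
result = safety_eval(
    query="Tokyo is the capital of which country?",
    response="Japan",
)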
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
"Tag": "python/evaluation/azure-ai-evaluation_acededcaea"
}
@@ -69,7 +69,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
# TODO address 3579092 to re-enabled parallel evals.
def __init__(self, credential, azure_ai_project, **kwargs):
super().__init__()
- self._parallel = kwargs.pop("_parallel", False)
+ self._parallel = kwargs.pop("_parallel", True)
self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
ViolenceEvaluator(credential, azure_ai_project),
SexualEvaluator(credential, azure_ai_project),
@@ -152,7 +152,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
with ThreadPoolExecutor() as executor:
# pylint: disable=no-value-for-parameter
futures = {
- executor.submit(query=query, response=response, conversation=conversation): evaluator
+ executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
for evaluator in self._evaluators
}

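The hunk above contains the actual bug fix: `ThreadPoolExecutor.submit` takes the callable as its first positional argument and forwards the remaining arguments to it, so the pre-fix call that passed only keyword arguments had nothing to execute and failed with a TypeError. A minimal sketch of the corrected pattern, using hypothetical stand-in evaluators rather than the real sub-evaluator classes:

# Minimal sketch: submit(fn, *args, **kwargs) needs the callable first.
# violence_eval and sexual_eval are hypothetical stand-ins for illustration.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict

def violence_eval(*, query: str, response: str) -> Dict[str, float]:
    return {"violence_score": 0.0}

def sexual_eval(*, query: str, response: str) -> Dict[str, float]:
    return {"sexual_score": 0.0}

evaluators = [violence_eval, sexual_eval]
results: Dict[str, float] = {}

with ThreadPoolExecutor() as executor:
    # The callable must be the first positional argument to submit();
    # omitting it, as the pre-fix code did, raises a TypeError.
    futures = {
        executor.submit(evaluator, query="What is the capital of Japan?",
                        response="The capital of Japan is Tokyo."): evaluator
        for evaluator in evaluators
    }
    for future in as_completed(futures):
        results.update(future.result())

print(results)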
@@ -318,26 +318,6 @@ def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cre
score2["evaluation_per_turn"]["hate_unfairness_reason"]
), "hate_unfairness_reason must not be None or empty."

- # TODO remove?
-
- @pytest.mark.skipif(True, reason="Content safety service IS available now!")
- def test_content_safety_service_unavailable(self, project_scope, azure_cred):
- eval_fn = ViolenceEvaluator(azure_cred, project_scope)
- # Doing this is replay mode breaks causes mismatch between scrubbed recordings
- # and the actual request made.
- if is_live():
- # Warning, live testing fails due to unstable region.
- # We need a use a new region.
- project_scope["project_name"] = "pf-evals-ws-westus2"
-
- with pytest.raises(Exception) as exc_info:
- score = eval_fn(
- query="What is the capital of Japan?",
- response="The capital of Japan is Tokyo.",
- )
-
- assert "RAI service is not available in this region" in exc_info._excinfo[1].args[0]
-
@pytest.mark.parametrize("parallel", [False, True])
def test_composite_evaluator_qa(self, model_config, parallel):
qa_eval = QAEvaluator(model_config, _parallel=parallel)
@@ -387,8 +367,9 @@ def test_composite_evaluator_qa_for_nans(self, model_config):
assert not math.isnan(score["fluency"])
assert not math.isnan(score["similarity"])

- def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
- safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=False)
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_composite_evaluator_content_safety(self, project_scope, azure_cred, parallel):
+ safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=parallel)
score = safety_eval(
query="Tokyo is the capital of which country?",
response="Japan",
@@ -408,8 +389,11 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
assert score["hate_unfairness_score"] < 1.0
assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

- def test_composite_evaluator_content_safety_with_conversation(self, project_scope, azure_cred, simple_conversation):
- safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_composite_evaluator_content_safety_with_conversation(
+ self, project_scope, azure_cred, simple_conversation, parallel
+ ):
+ safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=parallel)
score = safety_eval(
conversation=simple_conversation,
)
@@ -192,15 +192,16 @@ def test_evaluate_with_relative_data_path(self, model_config):
finally:
os.chdir(original_working_dir)

- def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file):
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file, parallel):
input_data = pd.read_json(data_file, lines=True)

# CS evaluator tries to store the credential, which breaks multiprocessing at
# pickling stage. So we pass None for credential and let child evals
# generate a default credential at runtime.
# Internal Parallelism is also disabled to avoid faulty recordings.
content_safety_eval = ContentSafetyEvaluator(
- credential=azure_cred, azure_ai_project=project_scope, _parallel=False
+ credential=azure_cred, azure_ai_project=project_scope, _parallel=parallel
)

# run the evaluation
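The comment retained in the hunk above refers to credential objects breaking multiprocessing at the pickling stage. A minimal sketch of that failure mode, using a thread lock as a hypothetical stand-in for a live credential or session object (not the actual azure-identity types):

# Minimal sketch: an object holding a non-picklable member cannot be shipped
# to worker processes. The lock stands in for a credential-like object.
import pickle
import threading

class EvaluatorHoldingCredential:
    def __init__(self):
        # Stand-in for a live credential/connection stored on the evaluator.
        self._credential = threading.Lock()

try:
    pickle.dumps(EvaluatorHoldingCredential())
except TypeError as exc:
    # Fails because '_thread.lock' objects cannot be pickled.
    print(f"pickling failed: {exc}")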
