Eval/bugfix/content safety parallel (Azure#38307)
* fix cs eval

* recordings and cl
MilesHolland authored and allenkim0129 committed Nov 5, 2024
1 parent d9d8ca8 commit 3560c15
Showing 5 changed files with 15 additions and 29 deletions.
1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -11,6 +11,7 @@
- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
- Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+ - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the `_parallel` boolean keyword argument during class initialization.

### Other Changes
- Refined error messages for serviced-based evaluators and simulators.
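As context for the changelog entry above, here is a minimal usage sketch of the `_parallel` flag. The `azure_ai_project` values and the credential setup are placeholders for illustration only and are not part of this commit.

# Minimal sketch: disabling parallel sub-evaluator execution via _parallel.
# The project values below are placeholders, not real identifiers.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# Parallel execution of the sub-evaluators is the default again;
# pass _parallel=False to run them sequentially instead.
safety_eval = ContentSafetyEvaluator(
    DefaultAzureCredential(), azure_ai_project, _parallel=False
)
result = safety_eval(
    query="Tokyo is the capital of which country?",
    response="Japan",
)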
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
"Tag": "python/evaluation/azure-ai-evaluation_acededcaea"
}
@@ -69,7 +69,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
# TODO address 3579092 to re-enabled parallel evals.
def __init__(self, credential, azure_ai_project, **kwargs):
super().__init__()
- self._parallel = kwargs.pop("_parallel", False)
+ self._parallel = kwargs.pop("_parallel", True)
self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
ViolenceEvaluator(credential, azure_ai_project),
SexualEvaluator(credential, azure_ai_project),
@@ -152,7 +152,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
with ThreadPoolExecutor() as executor:
# pylint: disable=no-value-for-parameter
futures = {
- executor.submit(query=query, response=response, conversation=conversation): evaluator
+ executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
for evaluator in self._evaluators
}

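The hunk above contains the actual bug fix: `ThreadPoolExecutor.submit` takes the callable as its first positional argument and forwards the remaining arguments to it, so the pre-fix call that passed only keyword arguments had nothing to execute and failed with a TypeError. A minimal sketch of the corrected pattern, using hypothetical stand-in evaluators rather than the real sub-evaluator classes:

# Minimal sketch: submit(fn, *args, **kwargs) needs the callable first.
# violence_eval and sexual_eval are hypothetical stand-ins for illustration.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict

def violence_eval(*, query: str, response: str) -> Dict[str, float]:
    return {"violence_score": 0.0}

def sexual_eval(*, query: str, response: str) -> Dict[str, float]:
    return {"sexual_score": 0.0}

evaluators = [violence_eval, sexual_eval]
results: Dict[str, float] = {}

with ThreadPoolExecutor() as executor:
    # The callable must be the first positional argument to submit();
    # omitting it, as the pre-fix code did, raises a TypeError.
    futures = {
        executor.submit(evaluator, query="What is the capital of Japan?",
                        response="The capital of Japan is Tokyo."): evaluator
        for evaluator in evaluators
    }
    for future in as_completed(futures):
        results.update(future.result())

print(results)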
@@ -318,26 +318,6 @@ def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cre
score2["evaluation_per_turn"]["hate_unfairness_reason"]
), "hate_unfairness_reason must not be None or empty."

- # TODO remove?
-
- @pytest.mark.skipif(True, reason="Content safety service IS available now!")
- def test_content_safety_service_unavailable(self, project_scope, azure_cred):
- eval_fn = ViolenceEvaluator(azure_cred, project_scope)
- # Doing this is replay mode breaks causes mismatch between scrubbed recordings
- # and the actual request made.
- if is_live():
- # Warning, live testing fails due to unstable region.
- # We need a use a new region.
- project_scope["project_name"] = "pf-evals-ws-westus2"
-
- with pytest.raises(Exception) as exc_info:
- score = eval_fn(
- query="What is the capital of Japan?",
- response="The capital of Japan is Tokyo.",
- )
-
- assert "RAI service is not available in this region" in exc_info._excinfo[1].args[0]
-
@pytest.mark.parametrize("parallel", [False, True])
def test_composite_evaluator_qa(self, model_config, parallel):
qa_eval = QAEvaluator(model_config, _parallel=parallel)
@@ -387,8 +367,9 @@ def test_composite_evaluator_qa_for_nans(self, model_config):
assert not math.isnan(score["fluency"])
assert not math.isnan(score["similarity"])

- def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
- safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=False)
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_composite_evaluator_content_safety(self, project_scope, azure_cred, parallel):
+ safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=parallel)
score = safety_eval(
query="Tokyo is the capital of which country?",
response="Japan",
@@ -408,8 +389,11 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
assert score["hate_unfairness_score"] < 1.0
assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

- def test_composite_evaluator_content_safety_with_conversation(self, project_scope, azure_cred, simple_conversation):
- safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_composite_evaluator_content_safety_with_conversation(
+ self, project_scope, azure_cred, simple_conversation, parallel
+ ):
+ safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=parallel)
score = safety_eval(
conversation=simple_conversation,
)
@@ -192,15 +192,16 @@ def test_evaluate_with_relative_data_path(self, model_config):
finally:
os.chdir(original_working_dir)

- def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file):
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file, parallel):
input_data = pd.read_json(data_file, lines=True)

# CS evaluator tries to store the credential, which breaks multiprocessing at
# pickling stage. So we pass None for credential and let child evals
# generate a default credential at runtime.
# Internal Parallelism is also disabled to avoid faulty recordings.
content_safety_eval = ContentSafetyEvaluator(
- credential=azure_cred, azure_ai_project=project_scope, _parallel=False
+ credential=azure_cred, azure_ai_project=project_scope, _parallel=parallel
)

# run the evaluation
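The comment retained in the hunk above refers to credential objects breaking multiprocessing at the pickling stage. A minimal sketch of that failure mode, using a thread lock as a hypothetical stand-in for a live credential or session object (not the actual azure-identity types):

# Minimal sketch: an object holding a non-picklable member cannot be shipped
# to worker processes. The lock stands in for a credential-like object.
import pickle
import threading

class EvaluatorHoldingCredential:
    def __init__(self):
        # Stand-in for a live credential/connection stored on the evaluator.
        self._credential = threading.Lock()

try:
    pickle.dumps(EvaluatorHoldingCredential())
except TypeError as exc:
    # Fails because '_thread.lock' objects cannot be pickled.
    print(f"pickling failed: {exc}")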
