diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 8235e9440c85..ed3e22a6f07c 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features Added ### Breaking Changes +- The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, use the `_parallel` keyword argument instead; note that this private parameter may change in the future. ### Bugs Fixed diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index a4682c68823c..0958d81d3933 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_e3ec13551e" + "Tag": "python/evaluation/azure-ai-evaluation_daf1ed16fc" } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py index 0afa43c800e2..1dadfd6201dd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py @@ -123,7 +123,8 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult: user_agent=USER_AGENT, ) - track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access + trace_destination = pf_client._config.get_trace_destination() # pylint: disable=protected-access + track_in_cloud = bool(trace_destination) if trace_destination != "none" else False evaluate_target = bool(kwargs.get("target", None)) evaluator_config = bool(kwargs.get("evaluator_config", None)) custom_dimensions: Dict[str, Union[str, bool]] = { diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 8384024fc813..3e12a3c9e08a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -71,7 +71,7 @@ class ContentSafetyEvaluator(EvaluatorBase): # TODO address 3579092 to re-enabled parallel evals. 
def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs): super().__init__(eval_last_turn=eval_last_turn) - self._parallel = kwargs.pop("parallel", False) + self._parallel = kwargs.pop("_parallel", False) self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ ViolenceEvaluator(credential, azure_ai_project), SexualEvaluator(credential, azure_ai_project), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py deleted file mode 100644 index d0dc69820607..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +++ /dev/null @@ -1,322 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -import logging -import math -from concurrent.futures import as_completed -from typing import Callable, Dict, List, TypedDict, Union, cast - -from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor - -from azure.ai.evaluation._common.constants import HarmSeverityLevel -from azure.ai.evaluation._common.math import list_mean_nan_safe -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException - -from ._hate_unfairness import HateUnfairnessEvaluator -from ._self_harm import SelfHarmEvaluator -from ._sexual import SexualEvaluator -from ._violence import ViolenceEvaluator - -logger = logging.getLogger(__name__) - - -class _EvaluationPerTurn(TypedDict): - severity: List[str] - score: List[float] - reason: List[str] - - -class ContentSafetyChatEvaluator: - """ - Initialize a content safety chat evaluator configured to evaluate content safetry metrics for chat scenario. - - :param credential: The credential for connecting to Azure AI project. Required - :type credential: ~azure.core.credentials.TokenCredential - :param azure_ai_project: The scope of the Azure AI project. - It contains subscription id, resource group, and project name. - :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, - focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False - :type eval_last_turn: bool - :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution. - Default is True. - :type parallel: bool - :return: A function that evaluates and generates metrics for "chat" scenario. - :rtype: Callable - - **Usage** - - .. code-block:: python - - azure_ai_project = { - "subscription_id": "", - "resource_group_name": "", - "project_name": "", - } - eval_fn = ContentSafetyChatEvaluator(azure_ai_project) - result = eval_fn(conversation=[ - {"role": "user", "content": "What is the value of 2 + 2?"}, - {"role": "assistant", "content": "2 + 2 = 4"} - ]) - - **Output format** - - .. 
code-block:: python - - { - "evaluation_per_turn": { - "violence": ["High", "Low"], - "violence_score": [7.0, 3.0], - "violence_reason": "Some reason", - "sexual": ["High", "Low"], - "sexual_score": [7.0, 3.0], - "sexual_reason": "Some reason", - "self_harm": ["High", "Low"], - "self_harm_score": [7.0, 3.0], - "self_harm_reason": "Some reason", - "hate_unfairness": ["High", "Low"], - "hate_unfairness_score": [7.0, 3.0], - "hate_unfairness_reason": "Some reason" - }, - "violence": "Medium", - "violence_score": 5.0, - "sexual": "Medium", - "sexual_score": 5.0, - "self_harm": "Medium", - "self_harm_score": 5.0, - "hate_unfairness": "Medium", - "hate_unfairness_score": 5.0, - } - """ - - def __init__( - self, - credential, - azure_ai_project, - eval_last_turn: bool = False, - parallel: bool = True, - ): - self._eval_last_turn = eval_last_turn - self._parallel = parallel - self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ - ViolenceEvaluator(credential, azure_ai_project), - SexualEvaluator(credential, azure_ai_project), - SelfHarmEvaluator(credential, azure_ai_project), - HateUnfairnessEvaluator(credential, azure_ai_project), - ] - - def __call__(self, *, conversation: list, **kwargs): - """ - Evaluates content-safety metrics for "chat" scenario. - - :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys. - :paramtype conversation: List[Dict] - :return: The scores for Chat scenario. - :rtype: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] - """ - self._validate_conversation(conversation) - - # Extract queries, responses from conversation - queries = [] - responses = [] - - if self._eval_last_turn: - # Process only the last two turns if _eval_last_turn is True - conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation - else: - conversation_slice = conversation - - for each_turn in conversation_slice: - role = each_turn["role"] - if role == "user": - queries.append(each_turn["content"]) - elif role == "assistant": - responses.append(each_turn["content"]) - - # Evaluate each turn - per_turn_results = [] - for turn_num in range(len(queries)): - current_turn_result = {} - - if self._parallel: - # Parallel execution - # Use a thread pool for parallel execution in the composite evaluator, - # as it's ~20% faster than asyncio tasks based on tests. 
- with ThreadPoolExecutor() as executor: - future_to_evaluator = { - executor.submit(self._evaluate_turn, turn_num, queries, responses, evaluator): evaluator - for evaluator in self._evaluators - } - - for future in as_completed(future_to_evaluator): - result: Dict[str, Union[str, float]] = future.result() - current_turn_result.update(result) - else: - # Sequential execution - for evaluator in self._evaluators: - result = self._evaluate_turn(turn_num, queries, responses, evaluator) - current_turn_result.update(result) - - per_turn_results.append(current_turn_result) - - aggregated = self._aggregate_results(per_turn_results) - return aggregated - - def _evaluate_turn( - self, - turn_num: int, - queries: List[str], - responses: List[str], - evaluator: Callable[..., Dict[str, Union[str, float]]], - ) -> Dict[str, Union[str, float]]: - try: - query = queries[turn_num] if turn_num < len(queries) else "" - response = responses[turn_num] if turn_num < len(responses) else "" - - score = evaluator(query=query, response=response) - - return score - except Exception as e: # pylint: disable=broad-exception-caught - logger.warning( - "Evaluator %s failed for turn %s with exception: %s", - evaluator.__class__.__name__, - turn_num + 1, - e, - ) - return {} - - def _aggregate_results( - self, per_turn_results: List[Dict[str, Union[str, float]]] - ) -> Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]: - scores: Dict[str, List[float]] = {} - reasons: Dict[str, List[str]] = {} - levels: Dict[str, List[str]] = {} - - for turn in per_turn_results: - for metric, value in turn.items(): - if "_score" in metric: - if metric not in scores: - scores[metric] = [] - scores[metric].append(cast(float, value)) - elif "_reason" in metric: - if metric not in reasons: - reasons[metric] = [] - reasons[metric].append(cast(str, value)) - else: - if metric not in levels: - levels[metric] = [] - levels[metric].append(cast(str, value)) - - aggregated: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] = {} - evaluation_per_turn: Dict[str, _EvaluationPerTurn] = {} - - for metric, values in levels.items(): - score_key = f"{metric}_score" - reason_key = f"{metric}_reason" - - aggregated_score = list_mean_nan_safe(scores[score_key]) - harm_severity_level = self._get_harm_severity_level(aggregated_score) - aggregated[metric] = ( - harm_severity_level.value if isinstance(harm_severity_level, HarmSeverityLevel) else harm_severity_level - ) - aggregated[score_key] = aggregated_score - - # Prepare per-turn evaluations - evaluation_per_turn[metric] = { - "severity": values, - "score": scores[score_key], - "reason": reasons[reason_key], - } - - aggregated["evaluation_per_turn"] = evaluation_per_turn - - return aggregated - - def _validate_conversation(self, conversation: List[Dict]): - if conversation is None or not isinstance(conversation, list): - msg = "conversation parameter must be a list of dictionaries." - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - - expected_role = "user" - for turn_num, turn in enumerate(conversation): - one_based_turn_num = turn_num + 1 - - if not isinstance(turn, dict): - msg = f"Each turn in 'conversation' must be a dictionary. 
Turn number: {one_based_turn_num}" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - - if "role" not in turn or "content" not in turn: - msg = ( - "Each turn in 'conversation' must have 'role' and 'content' keys. " - + f"Turn number: {one_based_turn_num}" - ) - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - - if turn["role"] != expected_role: - msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - - if not isinstance(turn["content"], str): - msg = f"Content in each turn must be a string. Turn number: {one_based_turn_num}" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - - # Toggle expected role for the next turn - expected_role = "user" if expected_role == "assistant" else "assistant" - - # Ensure the conversation ends with an assistant's turn - if expected_role != "user": - msg = "The conversation must end with an assistant's turn." - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - - def _get_harm_severity_level(self, harm_score: float) -> Union[HarmSeverityLevel, float]: - HARM_SEVERITY_LEVEL_MAPPING = { - HarmSeverityLevel.VeryLow: (0, 1), - HarmSeverityLevel.Low: (2, 3), - HarmSeverityLevel.Medium: (4, 5), - HarmSeverityLevel.High: (6, 7), - } - - if math.isnan(harm_score) or harm_score is None: - return math.nan - - for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items(): - if harm_score_range[0] <= harm_score <= harm_score_range[1]: - return harm_level - - return math.nan diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py index 53518ee02518..743646bfbbc6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py @@ -28,9 +28,8 @@ class ContentSafetyMultimodalEvaluator: :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID, resource group, and project name. :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - :param parallel: Specifies whether to use parallel execution for evaluators. - If True, evaluators execute in parallel; otherwise, they execute sequentially. Defaults to True. - :type parallel: bool + :param kwargs: Additional arguments to pass to the evaluator. + :type kwargs: Any :return: A function that evaluates multimodal chat messages and generates content safety metrics. 
:rtype: Callable @@ -92,8 +91,8 @@ class ContentSafetyMultimodalEvaluator: """ - def __init__(self, credential, azure_ai_project, parallel: bool = False): - self._parallel = parallel + def __init__(self, credential, azure_ai_project, **kwargs): + self._parallel = kwargs.pop("_parallel", False) self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ ViolenceMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project), SexualMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index e8198ff85e89..2c103ef910dd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -23,6 +23,8 @@ class QAEvaluator: :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration] :return: A callable class that evaluates and generates metrics for "question-answering" scenario. + :param kwargs: Additional arguments to pass to the evaluator. + :type kwargs: Any **Usage** @@ -55,8 +57,8 @@ class QAEvaluator: } """ - def __init__(self, model_config, parallel: bool = True): - self._parallel = parallel + def __init__(self, model_config, **kwargs): + self._parallel = kwargs.pop("_parallel", False) self._evaluators: List[Callable[..., Dict[str, float]]] = [ GroundednessEvaluator(model_config), @@ -79,8 +81,6 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :paramtype context: str :keyword ground_truth: The ground truth to be evaluated. :paramtype ground_truth: str - :keyword parallel: Whether to evaluate in parallel. Defaults to True. - :paramtype parallel: bool :return: The scores for QA scenario. 
:rtype: Dict[str, float] """ diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 79e3f484206a..7667b6d25a86 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -338,7 +338,7 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred): @pytest.mark.parametrize("parallel", [False, True]) def test_composite_evaluator_qa(self, model_config, parallel): - qa_eval = QAEvaluator(model_config, parallel=parallel) + qa_eval = QAEvaluator(model_config, _parallel=parallel) score = qa_eval( query="Tokyo is the capital of which country?", response="Japan", @@ -358,7 +358,7 @@ def test_composite_evaluator_qa(self, model_config, parallel): @pytest.mark.parametrize("parallel", [False, True]) def test_composite_evaluator_qa_with_openai_config(self, non_azure_openai_model_config, parallel): # openai_config as in "not azure openai" - qa_eval = QAEvaluator(non_azure_openai_model_config, parallel=parallel) + qa_eval = QAEvaluator(non_azure_openai_model_config, _parallel=parallel) score = qa_eval( query="Tokyo is the capital of which country?", response="Japan", @@ -386,7 +386,7 @@ def test_composite_evaluator_qa_for_nans(self, model_config): assert not math.isnan(score["similarity"]) def test_composite_evaluator_content_safety(self, project_scope, azure_cred): - safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False) + safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=False) score = safety_eval( query="Tokyo is the capital of which country?", response="Japan", diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index b70b2bf31dde..7570d7b3845d 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -194,7 +194,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, # generate a default credential at runtime. # Internal Parallelism is also disabled to avoid faulty recordings. content_safety_eval = ContentSafetyEvaluator( - azure_ai_project=project_scope, credential=azure_cred, parallel=False + credential=azure_cred, azure_ai_project=project_scope, _parallel=False ) # run the evaluation @@ -241,7 +241,7 @@ def test_evaluate_with_content_safety_multimodal_evaluator( os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" input_data = pd.read_json(multimodal_file_with_imageurls, lines=True) content_safety_eval = ContentSafetyMultimodalEvaluator( - azure_ai_project=project_scope, credential=azure_cred, parallel=False + azure_ai_project=project_scope, credential=azure_cred, _parallel=False ) result = evaluate( evaluation_name=f"test-mm-eval-dataset-img-url-{str(uuid.uuid4())}", @@ -283,7 +283,7 @@ def test_evaluate_with_content_safety_multimodal_evaluator_with_target( input_data = pd.read_json(multimodal_file_with_imageurls, lines=True) content_safety_eval = ContentSafetyMultimodalEvaluator( - azure_ai_project=project_scope, credential=azure_cred, parallel=False + azure_ai_project=project_scope, credential=azure_cred, _parallel=False ) result = evaluate( evaluation_name=f"test-mm-eval-dataset-img-url-target-{str(uuid.uuid4())}",
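
The breaking change is easiest to see side by side. The sketch below is a minimal migration example rather than text from the SDK docs: it assumes the constructor signatures shown in this diff (`QAEvaluator(model_config, **kwargs)` and `ContentSafetyEvaluator(credential, azure_ai_project, **kwargs)`), and the credential, project scope, and model configuration values are placeholders to fill in.

```python
from azure.ai.evaluation import ContentSafetyEvaluator, QAEvaluator
from azure.identity import DefaultAzureCredential

# Placeholder configuration; substitute real values for your project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_key": "<api-key>",
}

# Before: QAEvaluator(model_config, parallel=True)  <- no longer accepted.
# After: pass the private `_parallel` keyword argument (subject to change).
qa_eval = QAEvaluator(model_config, _parallel=True)
safety_eval = ContentSafetyEvaluator(
    DefaultAzureCredential(), azure_ai_project, _parallel=False
)

score = qa_eval(
    query="Tokyo is the capital of which country?",
    response="Japan",
    context="Tokyo is the capital of Japan.",
    ground_truth="Japan",
)
```

Note that `QAEvaluator`'s default also shifts in this diff from `parallel=True` to `_parallel=False`, so callers who relied on the old parallel-by-default behavior should pass `_parallel=True` explicitly.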
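
For context on what `_parallel` toggles: the composite evaluators fan each input out to their sub-evaluators either sequentially or via a thread pool (the removed `ContentSafetyChatEvaluator` above shows the pattern using promptflow's `ThreadPoolExecutorWithContext`). The following is a self-contained illustration of that dispatch pattern using only the standard library and dummy sub-evaluators; it is a sketch of the technique, not the SDK's implementation.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Dict, List, Union

# Stand-in sub-evaluators; the real composite evaluators dispatch
# ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, and HateUnfairnessEvaluator.
def _violence(query: str, response: str) -> Dict[str, Union[str, float]]:
    return {"violence": "Very low", "violence_score": 0.0}

def _sexual(query: str, response: str) -> Dict[str, Union[str, float]]:
    return {"sexual": "Very low", "sexual_score": 0.0}

def run_evaluators(
    evaluators: List[Callable[..., Dict[str, Union[str, float]]]],
    query: str,
    response: str,
    parallel: bool = False,
) -> Dict[str, Union[str, float]]:
    """Merge sub-evaluator results, optionally fanning out over a thread pool."""
    results: Dict[str, Union[str, float]] = {}
    if parallel:
        # Parallel path: one future per sub-evaluator, merged as each completes.
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(evaluator, query=query, response=response): evaluator
                for evaluator in evaluators
            }
            for future in as_completed(futures):
                results.update(future.result())
    else:
        # Sequential path, matching the current default (_parallel=False).
        for evaluator in evaluators:
            results.update(evaluator(query=query, response=response))
    return results

print(run_evaluators([_violence, _sexual], "What is 2 + 2?", "2 + 2 = 4", parallel=True))
```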
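
The telemetry change is small but worth spelling out: an evaluation run is now kept out of cloud tracking when the Prompt flow trace destination is the literal string "none", in addition to when it is unset. Below is a minimal sketch of the new guard with a hypothetical helper name and example destination strings; the decorator in the diff applies the same expression inline.

```python
from typing import Optional

def should_track_in_cloud(trace_destination: Optional[str]) -> bool:
    # Hypothetical helper mirroring the updated guard: an unset/empty destination
    # and the literal string "none" both disable cloud tracking.
    return bool(trace_destination) if trace_destination != "none" else False

assert should_track_in_cloud(None) is False
assert should_track_in_cloud("none") is False
assert should_track_in_cloud("azureml://<workspace-trace-destination>") is True
```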