[pf-evals] Document improvements and convert timeout to constant (#3609)
# Description

This PR applies two mechanical improvements across the promptflow-evals evaluators:

- Documentation: `:type:`/`:rtype:` docstring fields now use fully qualified, Sphinx-linkable targets (e.g. `~promptflow.core.AzureOpenAIModelConfiguration`, `~azure.core.credentials.TokenCredential`) and `Callable` in place of `function`.
- Constants: the hard-coded 600-second LLM call timeout and the prompty file names become `LLM_CALL_TIMEOUT` and `PROMPTY_FILE` class constants in each evaluator, and the hard-coded 60-second HTTP timeout in the RAI service client becomes `CommonConstants.DEFAULT_HTTP_TIMEOUT`.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Aug 2, 2024
1 parent 59d5b9f commit c34b962
Showing 19 changed files with 88 additions and 57 deletions.
@@ -242,15 +242,15 @@ class ChatEvaluator:
Initialize a chat evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
- :type model_config: AzureOpenAIModelConfiguration
+ :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
- :rtype: function
+ :rtype: Callable
**Usage**
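The docstring hunks in this commit replace bare class names with fully qualified, Sphinx-linkable targets and replace the non-type `function` with `Callable`. A minimal sketch of the resulting convention — the evaluator name and summary line below are illustrative, not from this diff:

```python
class ExampleEvaluator:
    """Evaluate something with an Azure OpenAI model.

    :param model_config: Configuration for the Azure OpenAI model.
    :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
    :return: A callable that computes the metric for one input.
    :rtype: Callable
    """
```

The leading `~` tells Sphinx to render only the final component (`AzureOpenAIModelConfiguration`) while linking the text to the fully qualified target, so the rendered reference stays short but navigable.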
@@ -21,6 +21,9 @@


class _AsyncRetrievalChatEvaluator:
+ PROMPTY_FILE = "retrieval.prompty"
+ LLM_CALL_TIMEOUT = 600

def __init__(self, model_config: AzureOpenAIModelConfiguration):
if model_config.api_version is None:
model_config.api_version = "2024-02-15-preview"
@@ -35,7 +38,7 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration):
prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": USER_AGENT})

current_dir = os.path.dirname(__file__)
- prompty_path = os.path.join(current_dir, "retrieval.prompty")
+ prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

async def __call__(self, *, conversation, **kwargs):
@@ -65,7 +68,9 @@ async def __call__(self, *, conversation, **kwargs):

history.append({"user": question, "assistant": answer})

- llm_output = await self._flow(query=question, history=history, documents=context, timeout=600, **kwargs)
+ llm_output = await self._flow(
+     query=question, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+ )
score = np.nan
if llm_output:
parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
@@ -96,9 +101,9 @@ class RetrievalChatEvaluator:
Initialize an evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
- :type model_config: AzureOpenAIModelConfiguration
+ :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
:return: A function that evaluates and generates metrics for "chat" scenario.
- :rtype: function
+ :rtype: Callable
**Usage**
.. code-block:: python
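Each evaluator in this commit hoists its prompty file name and its 600-second LLM call timeout into class constants, so every value is defined once and referenced by name. A condensed sketch of the pattern, assuming `AsyncPrompty` is importable from `promptflow.core`; the class and file names here are illustrative:

```python
import os

from promptflow.core import AsyncPrompty  # assumed import path


class _AsyncExampleEvaluator:
    # Named once at class level instead of repeated as inline literals.
    PROMPTY_FILE = "example.prompty"
    LLM_CALL_TIMEOUT = 600  # seconds

    def __init__(self, model_config):
        # Resolve the prompty definition relative to this module.
        prompty_path = os.path.join(os.path.dirname(__file__), self.PROMPTY_FILE)
        self._flow = AsyncPrompty.load(source=prompty_path, model=model_config)

    async def __call__(self, *, question: str, answer: str, **kwargs):
        # Every LLM call shares the same named timeout.
        return await self._flow(
            question=question, answer=answer, timeout=self.LLM_CALL_TIMEOUT, **kwargs
        )
```

Renaming a prompty file or tuning the timeout now touches exactly one line per evaluator.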
@@ -17,6 +17,9 @@


class _AsyncCoherenceEvaluator:
+ PROMPTY_FILE = "coherence.prompty"
+ LLM_CALL_TIMEOUT = 600

def __init__(self, model_config: AzureOpenAIModelConfiguration):
if model_config.api_version is None:
model_config.api_version = "2024-02-15-preview"
@@ -31,7 +34,7 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration):
prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": USER_AGENT})

current_dir = os.path.dirname(__file__)
- prompty_path = os.path.join(current_dir, "coherence.prompty")
+ prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

async def __call__(self, *, question: str, answer: str, **kwargs):
@@ -43,7 +46,7 @@ async def __call__(self, *, question: str, answer: str, **kwargs):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run the evaluation flow
- llm_output = await self._flow(question=question, answer=answer, timeout=600, **kwargs)
+ llm_output = await self._flow(question=question, answer=answer, timeout=self.LLM_CALL_TIMEOUT, **kwargs)

score = np.nan
if llm_output:
@@ -59,7 +62,7 @@ class CoherenceEvaluator:
Initialize a coherence evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
- :type model_config: AzureOpenAIModelConfiguration
+ :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
**Usage**
@@ -55,9 +55,9 @@ class ContentSafetyEvaluator:
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
- :rtype: function
+ :rtype: Callable
**Usage**
@@ -197,9 +197,9 @@ class ContentSafetyChatEvaluator:
Default is True.
:type parallel: bool
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
:return: A function that evaluates and generates metrics for "chat" scenario.
- :rtype: function
+ :rtype: Callable
**Usage**
@@ -33,7 +33,7 @@ class HateUnfairnessEvaluator:
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
**Usage**
@@ -31,7 +31,7 @@ class SelfHarmEvaluator:
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
**Usage**
@@ -31,7 +31,7 @@ class SexualEvaluator:
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
**Usage**
@@ -31,7 +31,7 @@ class ViolenceEvaluator:
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
**Usage**
@@ -4,6 +4,12 @@
from enum import Enum


+ class CommonConstants:
+     """Define common constants."""
+
+     DEFAULT_HTTP_TIMEOUT = 60


class RAIService:
"""Define constants related to RAI service"""

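The new `CommonConstants.DEFAULT_HTTP_TIMEOUT` replaces the literal `timeout=60` repeated across the RAI service client in the hunks below. A minimal sketch of how the constant is consumed with `httpx`; the function name and arguments are placeholders:

```python
import httpx

from constants import CommonConstants  # imported package-relatively in the real module


async def check_annotation_service(svc_liveness_url: str, headers: dict) -> bool:
    # All service calls share one named timeout, tunable in constants.py alone.
    async with httpx.AsyncClient() as client:
        response = await client.get(
            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
        )
    return response.status_code == 200
```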
@@ -24,7 +24,7 @@ class ContentSafetyEvaluatorBase(ABC):
It contains subscription id, resource group, and project name.
:type project_scope: Dict
:param credential: The credential for connecting to Azure AI project.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
"""

def __init__(self, metric: EvaluationMetrics, project_scope: dict, credential=None):
@@ -16,10 +16,10 @@
from azure.identity import DefaultAzureCredential

try:
- from .constants import EvaluationMetrics, RAIService, Tasks
+ from .constants import CommonConstants, EvaluationMetrics, RAIService, Tasks
from .utils import get_harm_severity_level
except ImportError:
- from constants import EvaluationMetrics, RAIService, Tasks
+ from constants import CommonConstants, EvaluationMetrics, RAIService, Tasks
from utils import get_harm_severity_level

try:
@@ -62,7 +62,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
svc_liveness_url = rai_svc_url + "/checkannotation"

async with httpx.AsyncClient() as client:
- response = await client.get(svc_liveness_url, headers=headers, timeout=60)
+ response = await client.get(svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)

if response.status_code != 200:
raise Exception( # pylint: disable=broad-exception-raised
@@ -101,7 +101,7 @@ async def submit_request(question: str, answer: str, metric: str, rai_svc_url: s
headers = get_common_headers(token)

async with httpx.AsyncClient() as client:
- response = await client.post(url, json=payload, headers=headers, timeout=60)
+ response = await client.post(url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)

if response.status_code != 202:
print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
@@ -120,7 +120,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
:param rai_svc_url: The Responsible AI service URL.
:type rai_svc_url: str
:param credential: The Azure authentication credential.
- :type credential: TokenCredential
+ :type credential: ~azure.core.credentials.TokenCredential
:param token: The Azure authentication token.
:type token: str
:return: The annotation result.
Expand All @@ -135,7 +135,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
headers = get_common_headers(token)

async with httpx.AsyncClient() as client:
- response = await client.get(url, headers=headers, timeout=60)
+ response = await client.get(url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)

if response.status_code == 200:
return response.json()
@@ -245,7 +245,7 @@ async def _get_service_discovery_url(azure_ai_project: dict, token: str) -> str:
f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
f"api-version=2023-08-01-preview",
headers=headers,
-     timeout=60,
+     timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
)
if response.status_code != 200:
raise Exception("Failed to retrieve the discovery service URL") # pylint: disable=broad-exception-raised
@@ -280,11 +280,12 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
"""Get token. Fetch a new token if the current token is near expiry
- :param credential: The Azure authentication credential.
- :type credential: TokenCredential
- :param token: The Azure authentication token. Defaults to None. If none, a new token will be fetched.
- :type token: str
- :return: The Azure authentication token.
+ :param credential: The Azure authentication credential.
+ :type credential:
+     ~azure.core.credentials.TokenCredential
+ :param token: The Azure authentication token. Defaults to None. If none, a new token will be fetched.
+ :type token: str
+ :return: The Azure authentication token.
"""
acquire_new_token = True
try:
@@ -311,18 +312,19 @@ async def evaluate_with_rai_service(
):
""" "Evaluate the content safety of the answer using Responsible AI service
- :param question: The question to evaluate.
- :type question: str
- :param answer: The answer to evaluate.
- :type answer: str
- :param metric_name: The evaluation metric to use.
- :type metric_name: str
- :param project_scope: The Azure AI project scope details.
- :type project_scope: Dict
- :param credential: The Azure authentication credential.
- :type credential: TokenCredential
- :return: The parsed annotation result.
- :rtype: List[List[Dict]]
+ :param question: The question to evaluate.
+ :type question: str
+ :param answer: The answer to evaluate.
+ :type answer: str
+ :param metric_name: The evaluation metric to use.
+ :type metric_name: str
+ :param project_scope: The Azure AI project scope details.
+ :type project_scope: Dict
+ :param credential: The Azure authentication credential.
+ :type credential:
+     ~azure.core.credentials.TokenCredential
+ :return: The parsed annotation result.
+ :rtype: List[List[Dict]]
"""
# Use DefaultAzureCredential if no credential is provided
# This is for the batch run scenario as the credential cannot be serialized by promptflow
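`fetch_or_reuse_token` above acquires a fresh token only when the current one is missing or near expiry. A hedged sketch of such a check, assuming the token is a JWT whose `exp` claim can be read without verifying the signature; the PyJWT dependency and the five-minute threshold are illustrative assumptions, not taken from this diff:

```python
import time

import jwt  # PyJWT; an assumption — the real module may perform this check differently


def needs_refresh(token: str, threshold_seconds: int = 300) -> bool:
    """Return True when the token is absent or expires within the threshold."""
    if not token:
        return True
    # Decode only to read the 'exp' claim; no signature verification is needed for that.
    claims = jwt.decode(token, options={"verify_signature": False})
    return claims["exp"] - time.time() < threshold_seconds
```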
@@ -17,6 +17,9 @@


class _AsyncFluencyEvaluator:
+ PROMPTY_FILE = "fluency.prompty"
+ LLM_CALL_TIMEOUT = 600

def __init__(self, model_config: AzureOpenAIModelConfiguration):
if model_config.api_version is None:
model_config.api_version = "2024-02-15-preview"
@@ -31,7 +34,7 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration):
prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": USER_AGENT})

current_dir = os.path.dirname(__file__)
- prompty_path = os.path.join(current_dir, "fluency.prompty")
+ prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

async def __call__(self, *, question: str, answer: str, **kwargs):
@@ -43,7 +46,7 @@ async def __call__(self, *, question: str, answer: str, **kwargs):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run the evaluation flow
- llm_output = await self._flow(question=question, answer=answer, timeout=600, **kwargs)
+ llm_output = await self._flow(question=question, answer=answer, timeout=self.LLM_CALL_TIMEOUT, **kwargs)

score = np.nan
if llm_output:
@@ -59,7 +62,7 @@ class FluencyEvaluator:
Initialize a fluency evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
- :type model_config: AzureOpenAIModelConfiguration
+ :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
**Usage**
@@ -17,6 +17,9 @@


class _AsyncGroundednessEvaluator:
+ PROMPTY_FILE = "groundedness.prompty"
+ LLM_CALL_TIMEOUT = 600

def __init__(self, model_config: AzureOpenAIModelConfiguration):
if model_config.api_version is None:
model_config.api_version = "2024-02-15-preview"
@@ -43,7 +46,7 @@ async def __call__(self, *, answer: str, context: str, **kwargs):
raise ValueError("Both 'answer' and 'context' must be non-empty strings.")

# Run the evaluation flow
- llm_output = await self._flow(answer=answer, context=context, timeout=600, **kwargs)
+ llm_output = await self._flow(answer=answer, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs)

score = np.nan
if llm_output:
@@ -59,7 +62,7 @@ class GroundednessEvaluator:
Initialize a groundedness evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
- :type model_config: AzureOpenAIModelConfiguration
+ :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
**Usage**
4 changes: 2 additions & 2 deletions src/promptflow-evals/promptflow/evals/evaluators/_qa/_qa.py
@@ -59,9 +59,9 @@ class QAEvaluator:
Initialize a question-answer evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
- :type model_config: AzureOpenAIModelConfiguration
+ :type model_config: ~promptflow.core.AzureOpenAIModelConfiguration
:return: A function that evaluates and generates metrics for "question-answering" scenario.
- :rtype: function
+ :rtype: Callable
**Usage**
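For orientation, the evaluators touched by this commit all follow the same call pattern: construct the evaluator with a model configuration, then invoke the instance like a function. A usage sketch with placeholder endpoint and deployment values; the keyword arguments shown for `QAEvaluator` are an assumption based on its documented inputs:

```python
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluators import QAEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-api-key>",
    azure_deployment="<your-deployment>",
)

qa_evaluator = QAEvaluator(model_config)
result = qa_evaluator(
    question="Which tent is the most waterproof?",
    answer="The Alpine Explorer Tent is the most waterproof.",
    context="The Alpine Explorer tent has the highest rainfly waterproof rating.",
    ground_truth="The Alpine Explorer Tent.",
)
```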