microsoft · nick863 · Jul 15, 2024 · Jul 10, 2024 · Jul 10, 2024 · Jul 10, 2024
diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml
@@ -0,0 +1,99 @@
+# Runs the promptflow-evals tests marked `localtest` after checking that the azure packages are not installed.
+name: promptflow-evals-local-e2e-test
+
+on:
+  schedule:
+    - cron: "40 10 * * *" # 10:40 UTC (2:40 PST) every day
+  pull_request:
+    paths:
+      - src/promptflow-evals/**
+      - .github/workflows/promptflow-evals-local-e2e-test.yml
+  workflow_dispatch:
+
+env:
+  IS_IN_CI_PIPELINE: "true"
+  WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13]
+        # TODO: A hash mismatch occurs for the ubuntu-latest and 3.9 combination while installing the promptflow-evals package:
+        # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
+        # Add 3.9 back once the issue is resolved.
+        python-version: ['3.8', '3.10', '3.11']
+      fail-fast: false
+    # snok/install-poetry needs the bash shell to support Windows
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: set test mode
+        # Always run in replay mode for now, until a test resource for live mode is available
+        run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
+        #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: snok/install-poetry@v1
+      - name: install test dependency group
+        run: poetry install --only test
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: install promptflow packages in editable mode
+        run: |
+          poetry run pip install -e ../promptflow
+          poetry run pip install -e ../promptflow-core
+          poetry run pip install -e ../promptflow-devkit
+          poetry run pip install -e ../promptflow-tracing
+          poetry run pip install -e ../promptflow-tools
+          poetry run pip install -e ../promptflow-evals
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: generate end-to-end test config from secret
+        # Pass the secret through the environment so quotes inside the JSON cannot break the shell command
+        env:
+          PF_EVALS_E2E_TEST_CONFIG: ${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}
+        run: printf '%s\n' "$PF_EVALS_E2E_TEST_CONFIG" > connections.json
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: check azure is not installed
+        run: poetry run pytest scripts/code_qa/assert_local_install.py
+        working-directory: ${{ github.workspace }}
+      - name: run e2e tests
+        run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          name: report-${{ matrix.os }}-py${{ matrix.python-version }}
+          path: |
+            ${{ env.WORKING_DIRECTORY }}/*.xml
+            ${{ env.WORKING_DIRECTORY }}/htmlcov/
+
+  report:
+    needs: test
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+      contents: read
+      issues: read
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+      - uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          check_name: promptflow-evals test result
+          comment_title: promptflow-evals test result
+          files: "artifacts/**/test-results.xml"  # align with `--junit-xml` in pyproject.toml
+      - uses: irongut/CodeCoverageSummary@v1.3.0
+        with:
+          filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
+          badge: true
+          fail_below_min: false
+          format: markdown
+          hide_complexity: true
+          output: both
+          thresholds: 40 80
@@ -0,0 +1,27 @@
+"""Tests checking that azure packages are NOT installed."""
+import importlib.util
+
+import pytest
+
+
+class TestPackagesNotInstalles():
+    """Verify that the optional Azure packages are absent from the local test environment."""
+
+    @pytest.mark.parametrize('package', [
+        'promptflow.azure',
+        'azure.ai.ml',
+        'azure.identity',
+        'azure.storage.blob'
+    ])
+    def test_promptflow_azure(self, package):
+        """Test that the given azure package is not installed.
+
+        :param package: Dotted module name that must not be importable.
+        """
+        try:
+            spec = importlib.util.find_spec(package)
+        except ModuleNotFoundError:
+            # find_spec imports the parent packages of a dotted name and raises
+            # when one of them is missing, which also means "not installed".
+            spec = None
+        assert spec is None, f'Package {package} must be uninstalled for local test.'
@@ -11,15 +11,24 @@
 from typing import Any, Dict, Optional, Type
 from urllib.parse import urlparse
 
-from azure.storage.blob import BlobServiceClient
+
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 from promptflow.evals._version import VERSION
 from promptflow._sdk.entities import Run
 
-from azure.ai.ml.entities._credentials import AccountKeyConfiguration
-from azure.ai.ml.entities._datastore.datastore import Datastore
+# Handle optional import. The azure libraries are only present if
+# promptflow-azure is installed.
+try:
+    from azure.ai.ml.entities._credentials import AccountKeyConfiguration
+    from azure.ai.ml.entities._datastore.datastore import Datastore
+    from azure.storage.blob import BlobServiceClient
+except (ModuleNotFoundError, ImportError):
+    # If the modules above cannot be imported, we are running in local
+    # mode, the MLClient in the constructor will be None, and the
+    # Azure-dependent code is never reached.
+    pass
 
 LOGGER = logging.getLogger(__name__)
 
@@ -382,7 +391,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
         if response.status_code != 200:
             self._log_warning('register artifact', response)
 
-    def _get_datastore_credential(self, datastore: Datastore):
+    def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
         credential = datastore.credentials

diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py
@@ -75,7 +75,13 @@ def __call__(self, *, answer: str, context: str, **kwargs):
             raise ValueError("Both 'answer' and 'context' must be non-empty strings.")
 
         # Run the evaluation flow
-        llm_output = self._flow(answer=answer, context=context)
+        # Default to None so a failed flow call falls through to the NaN score
+        # below instead of raising UnboundLocalError on `llm_output`.
+        llm_output = None
+        try:
+            llm_output = self._flow(answer=answer, context=context)
+        except Exception as e:  # let KeyboardInterrupt/SystemExit propagate
+            print(e)
 
         score = np.nan
         if llm_output:

@@ -6,7 +6,7 @@
 
 import jwt
 import pytest
-from azure.ai.ml._ml_client import MLClient
+
 from pytest_mock import MockerFixture
 
 from promptflow.client import PFClient
@@ -20,8 +20,8 @@
     from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
 except ImportError as e:
     print(f"Failed to import promptflow-recording: {e}")
-
     # Run test in empty mode if promptflow-recording is not installed
+
     def recording_array_reset():
         pass
 
@@ -37,6 +37,12 @@ def is_record():
     def is_replay():
         return False
 
+# Import of optional packages
+AZURE_INSTALLED = True
+try:
+    from azure.ai.ml._ml_client import MLClient
+except ImportError:
+    AZURE_INSTALLED = False
 
 PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
 CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
 @pytest.fixture
 def azure_ml_client(project_scope: Dict):
     """The fixture, returning MLClient"""
-    return MLClient(
-        subscription_id=project_scope["subscription_id"],
-        resource_group_name=project_scope["resource_group_name"],
-        workspace_name=project_scope["project_name"],
-        credential=get_cred(),
-    )
+    # Without the optional azure packages there is no MLClient class to build.
+    if not AZURE_INSTALLED:
+        return None
+    return MLClient(
+        subscription_id=project_scope["subscription_id"],
+        resource_group_name=project_scope["resource_group_name"],
+        workspace_name=project_scope["project_name"],
+        credential=get_cred(),
+    )
 
 
 @pytest.fixture

@@ -13,6 +13,7 @@
 @pytest.mark.usefixtures("recording_injection", "vcr_recording")
 @pytest.mark.e2etest
 class TestBuiltInEvaluators:
+    @pytest.mark.localtest
     def test_individual_evaluator_prompt_based(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
         score = eval_fn(
@@ -22,6 +23,7 @@ def test_individual_evaluator_prompt_based(self, model_config):
         assert score is not None
         assert score["gpt_fluency"] > 1.0
 
+    @pytest.mark.localtest
     def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
         score = eval_fn(
@@ -55,6 +57,7 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred):
 
         assert "RAI service is not available in this region" in exc_info._excinfo[1].inner_exception.args[0]
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize("parallel", [False, True])
     def test_composite_evaluator_qa(self, model_config, parallel):
         qa_eval = QAEvaluator(model_config, parallel=parallel)
@@ -94,6 +97,7 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "eval_last_turn, parallel",
         [

@@ -82,6 +82,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
 @pytest.mark.usefixtures("recording_injection")
 @pytest.mark.e2etest
 class TestEvaluate:
+    @pytest.mark.localtest
     def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         # data
         input_data = pd.read_json(data_file, lines=True)
@@ -153,6 +154,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file,
         assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1
         assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "use_pf_client,function,column",
         [
@@ -186,6 +188,7 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu
         assert metrics.get(metric) == np.nanmean(row_result_df[out_column])
         assert row_result_df[out_column][2] == 31
 
+    @pytest.mark.localtest
     def test_evaluate_with_target(self, questions_file):
         """Test evaluation with target function."""
         # We cannot define target in this file as pytest will load
@@ -209,6 +212,7 @@ def test_evaluate_with_target(self, questions_file):
         assert "outputs.f1.f1_score" in row_result_df.columns
         assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"])
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "evaluation_config",
         [
@@ -251,6 +255,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config):
         expected = list(row_result_df[question].str.len())
         assert expected == list(row_result_df["outputs.question_ev.length"])
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "evaluate_config",
         [
@@ -386,6 +391,7 @@ def test_evaluate_track_in_cloud_no_target(
         assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent"
         assert remote_run["runMetadata"]["displayName"] == evaluation_name
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "return_json, aggregate_return_json",
         [
@@ -410,6 +416,7 @@ def test_evaluate_aggregation_with_threadpool(self, data_file, return_json, aggr
         if aggregate_return_json:
             assert "answer_length.median" in result["metrics"].keys()
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "return_json, aggregate_return_json",
         [