microsoft · nick863 · Jul 15, 2024 · Jul 10, 2024 · Jul 10, 2024 · Jul 10, 2024
@@ -1,12 +1,12 @@
-name: promptflow-evals-e2e-test
+name: promptflow-evals-e2e-test-azure
 
 on:
   schedule:
     - cron: "40 10 * * *" # 2:40 PST every day
   pull_request:
     paths:
       - src/promptflow-evals/**
-      - .github/workflows/promptflow-evals-e2e-test.yml
+      - .github/workflows/promptflow-evals-e2e-test-azure.yml
   workflow_dispatch:
 
 env:
@@ -83,10 +83,10 @@ jobs:
           creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
           enable-AzPSSession: true
       - name: run e2e tests
-        id: run_all_e2e_tests
+        id: run_e2e_tests_azure
         run: | 
-          poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
-          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity all_e2e_tests_run_times --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+          poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_azure --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
         working-directory: ${{ env.WORKING_DIRECTORY }}
       - name: upload coverage report
         uses: actions/upload-artifact@v4

@@ -0,0 +1,99 @@
+# e2e test workflow for promptflow-evals in a local install; the `test` job
+# contains a step named "check azure is not installed".
+name: promptflow-evals-e2e-test-local
+
+on:
+  schedule:
+    - cron: "40 10 * * *" # 10:40 UTC (2:40 PST) every day
+  pull_request:
+    # Only trigger for changes to the package itself or to this workflow file.
+    paths:
+      - src/promptflow-evals/**
+      - .github/workflows/promptflow-evals-e2e-test-local.yml
+  # Allow the workflow to be started manually.
+  workflow_dispatch:
+
+# Workflow-level environment, visible to every job and step below.
+env:
+  IS_IN_CI_PIPELINE: "true"
+  WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13]
+        # NOTE(review): a hash mismatch was seen for the ubuntu-latest + 3.9 combination while
+        # installing the promptflow-evals package:
+        # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
+        # The original TODO said to add 3.9 back once the issue is understood, but '3.9' is
+        # listed below - confirm the issue is resolved, or drop '3.9' from the matrix again.
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+      # Let the remaining matrix combinations finish when one of them fails.
+      fail-fast: false
+    # snok/install-poetry needs this to support Windows
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: set test mode
+        # Always run in replay mode for now until we figure out the test resource to run live mode
+        run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
+        # Disabled variant that would pick replay for pull requests and live otherwise:
+        #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: snok/install-poetry@v1
+      - name: install test dependency group
+        run: poetry install --only test
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: install promptflow packages in editable mode
+        # Paths are relative to WORKING_DIRECTORY (src/promptflow-evals), i.e. these
+        # are the sibling package directories next to promptflow-evals.
+        run: |
+          poetry run pip install -e ../promptflow
+          poetry run pip install -e ../promptflow-core
+          poetry run pip install -e ../promptflow-devkit
+          poetry run pip install -e ../promptflow-tracing
+          poetry run pip install -e ../promptflow-tools
+          poetry run pip install -e ../promptflow-evals
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: generate end-to-end test config from secret
+        # Hand the secret to the shell through an environment variable instead of
+        # expanding it into the script text: a quote or other shell metacharacter
+        # in the JSON can then neither break nor inject into the command.
+        env:
+          PF_EVALS_E2E_TEST_CONFIG: ${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}
+        run: echo "$PF_EVALS_E2E_TEST_CONFIG" >> connections.json
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: check azure is not installed
+        # Guard for the local-only scenario: presumably the script fails when the
+        # azure packages are importable - TODO confirm against assert_local_install.py.
+        run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: run e2e tests
+        id: run_e2e_tests_local
+        # Every script line must share one indentation level: a line indented less
+        # than the first line of a `|` block ends the scalar and breaks the YAML.
+        # NOTE(review): both pytest runs use identical report options, so the second
+        # run overwrites the coverage output of the first. The `-m e2etest` run also
+        # looks like a leftover now that tests carry localtest/azuretest markers -
+        # confirm whether it should stay.
+        run: |
+          poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          # One artifact per matrix combination. The report job reads
+          # report-ubuntu-latest-py3.11/coverage.xml, so keep this name pattern in sync.
+          name: report-${{ matrix.os }}-py${{ matrix.python-version }}
+          path: |
+            ${{ env.WORKING_DIRECTORY }}/*.xml
+            ${{ env.WORKING_DIRECTORY }}/htmlcov/
+
+  report:
+    # Runs after the `test` matrix and publishes test results plus a coverage summary.
+    needs: test
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+      contents: read
+      issues: read
+    steps:
+      # No artifact name is given, so every uploaded report-* artifact is
+      # downloaded into its own sub-directory of `artifacts/`.
+      - uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+      - uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          check_name: promptflow-evals test result
+          comment_title: promptflow-evals test result
+          files: "artifacts/**/test-results.xml"  # align with `--junit-xml` in pyproject.toml
+      # Coverage is summarised from a single matrix cell (ubuntu-latest, Python 3.11).
+      - uses: irongut/CodeCoverageSummary@v1.3.0
+        with:
+          filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
+          badge: true
+          fail_below_min: false
+          format: markdown
+          hide_complexity: true
+          output: both
+          thresholds: 40 80
@@ -0,0 +1,21 @@
+"""Tests checking that azure packages are NOT installed."""
+import importlib
+import pytest
+
+
+class TestPackagesNotInstalles():
+    """Test imports."""
+
+    @pytest.mark.parametrize('package', [
+        'promptflow.azure',
+        'azure.ai.ml',
+        'azure.identity',
+        'azure.storage.blob'
+    ])
+    def test_promptflow_azure(self, package):
+        """Test promptflow. azure is not installed."""
+        try:
+            importlib.import_module(package)
+            assert False, f'Package {package} must be uninstalled for local test.'
+        except (ModuleNotFoundError, ImportError):
+            pass
@@ -19,11 +19,23 @@
 
 LOGGER = logging.getLogger(__name__)
 
+
+# Handle optional import. The azure libraries are only present if
+# promptflow-azure is installed.
 try:
-    from azure.storage.blob import BlobServiceClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration
-except ImportError:
-    LOGGER.debug("promptflow.azure is not installed.")
+    from azure.ai.ml.entities._datastore.datastore import Datastore
+    from azure.storage.blob import BlobServiceClient
+except (ModuleNotFoundError, ImportError):
+    # If the above-mentioned modules cannot be imported, we are running
+    # in local mode and the MLClient in the constructor will be None, so
+    # we will never reach the Azure-dependent code.
+    LOGGER.warning(
+        "azure-ai-ml cannot be imported. "
+        "The results will be saved locally, but will not be logged to Azure. "
+        "To log results to azure please install promptflow-evals with the command "
+        "pip install promptflow-evals[azure]"
+        )
 
 
 @dataclasses.dataclass
@@ -384,7 +396,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
         if response.status_code != 200:
             self._log_warning('register artifact', response)
 
-    def _get_datastore_credential(self, datastore: 'Datastore'):
+    def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
         credential = datastore.credentials

@@ -4,9 +4,8 @@
 from typing import Dict
 from unittest.mock import patch
 
-import jwt
 import pytest
-from azure.ai.ml._ml_client import MLClient
+
 from pytest_mock import MockerFixture
 
 from promptflow.client import PFClient
@@ -20,8 +19,8 @@
     from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
 except ImportError as e:
     print(f"Failed to import promptflow-recording: {e}")
-
     # Run test in empty mode if promptflow-recording is not installed
+
     def recording_array_reset():
         pass
 
@@ -37,6 +36,13 @@ def is_record():
     def is_replay():
         return False
 
+# Import of optional packages
+AZURE_INSTALLED = True
+try:
+    import jwt
+    from azure.ai.ml._ml_client import MLClient
+except ImportError:
+    AZURE_INSTALLED = False
 
 PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
 CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
 @pytest.fixture
 def azure_ml_client(project_scope: Dict):
     """The fixture, returning MLClient"""
-    return MLClient(
-        subscription_id=project_scope["subscription_id"],
-        resource_group_name=project_scope["resource_group_name"],
-        workspace_name=project_scope["project_name"],
-        credential=get_cred(),
-    )
+    if AZURE_INSTALLED:
+        return MLClient(
+            subscription_id=project_scope["subscription_id"],
+            resource_group_name=project_scope["resource_group_name"],
+            workspace_name=project_scope["project_name"],
+            credential=get_cred(),
+        )
+    else:
+        return None
 
 
 @pytest.fixture
@@ -293,6 +302,8 @@ def azure_cred():
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def user_object_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -305,6 +316,8 @@ def user_object_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def tenant_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -317,9 +330,12 @@ def tenant_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def variable_recorder():
-    from promptflow.recording.azure import VariableRecorder
+    if pytest.is_record or pytest.is_replay:
+        from promptflow.recording.azure import VariableRecorder
 
-    yield VariableRecorder()
+        yield VariableRecorder()
+    else:
+        yield None
 
 
 @pytest.fixture(scope=package_scope_in_live_mode())

@@ -6,7 +6,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestAdvSimulator:
     @pytest.mark.usefixtures("vcr_recording")
     def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope):

@@ -11,8 +11,8 @@
 
 
 @pytest.mark.usefixtures("recording_injection", "vcr_recording")
-@pytest.mark.e2etest
 class TestBuiltInEvaluators:
+    @pytest.mark.localtest
     def test_individual_evaluator_prompt_based(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
         score = eval_fn(
@@ -22,6 +22,7 @@ def test_individual_evaluator_prompt_based(self, model_config):
         assert score is not None
         assert score["gpt_fluency"] > 1.0
 
+    @pytest.mark.localtest
     def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
         score = eval_fn(
@@ -31,6 +32,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
         assert score is not None
         assert score["gpt_fluency"] > 0.0
 
+    @pytest.mark.azuretest
     def test_individual_evaluator_service_based(self, project_scope, azure_cred):
         eval_fn = ViolenceEvaluator(project_scope, azure_cred)
         score = eval_fn(
@@ -42,6 +44,7 @@ def test_individual_evaluator_service_based(self, project_scope, azure_cred):
         assert score["violence_score"] < 1.0
         assert score["violence_reason"], "violence_reason must not be None or empty."
 
+    @pytest.mark.azuretest
     @pytest.mark.skip(reason="Not working in ci pipeline. For local run.")
     def test_content_safety_service_unavailable(self, project_scope, azure_cred):
         eval_fn = ViolenceEvaluator(project_scope, azure_cred)
@@ -55,6 +58,7 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred):
 
         assert "RAI service is not available in this region" in exc_info._excinfo[1].inner_exception.args[0]
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize("parallel", [False, True])
     def test_composite_evaluator_qa(self, model_config, parallel):
         qa_eval = QAEvaluator(model_config, parallel=parallel)
@@ -73,6 +77,7 @@ def test_composite_evaluator_qa(self, model_config, parallel):
         assert score["gpt_similarity"] > 0.0
         assert score["f1_score"] > 0.0
 
+    @pytest.mark.azuretest
     def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
         score = safety_eval(
@@ -94,6 +99,7 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."
 
+    @pytest.mark.localtest
     @pytest.mark.parametrize(
         "eval_last_turn, parallel",
         [
@@ -156,6 +162,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel):
         assert score["evaluation_per_turn"]["gpt_retrieval"] is not None
         assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count
 
+    @pytest.mark.azuretest
     @pytest.mark.parametrize(
         "eval_last_turn, parallel",
         [