Add local e2e test gate #3534

Merged · 26 commits · Jul 15, 2024
.github/workflows/promptflow-evals-e2e-test.yml → .github/workflows/promptflow-evals-e2e-test-azure.yml
@@ -1,12 +1,12 @@
name: promptflow-evals-e2e-test
name: promptflow-evals-e2e-test-azure

on:
schedule:
- cron: "40 10 * * *" # 2:40 PST every day
pull_request:
paths:
- src/promptflow-evals/**
- .github/workflows/promptflow-evals-e2e-test.yml
- .github/workflows/promptflow-evals-e2e-test-azure.yml
workflow_dispatch:

env:
@@ -83,10 +83,10 @@ jobs:
creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
enable-AzPSSession: true
- name: run e2e tests
id: run_all_e2e_tests
id: run_e2e_tests_azure
run: |
poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity all_e2e_tests_run_times --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_azure --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: upload coverage report
uses: actions/upload-artifact@v4
98 changes: 98 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test-local.yml
@@ -0,0 +1,98 @@
name: promptflow-evals-e2e-test-local

on:
schedule:
- cron: "40 10 * * *" # 2:40 PST every day
pull_request:
paths:
- src/promptflow-evals/**
- .github/workflows/promptflow-evals-e2e-test-local.yml
workflow_dispatch:

env:
IS_IN_CI_PIPELINE: "true"
WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals

jobs:
test:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-13]
# TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package
# https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
# Add 3.9 back after we figure out the issue
python-version: ['3.8', '3.9', '3.10', '3.11']
fail-fast: false
# snok/install-poetry need this to support Windows
defaults:
run:
shell: bash
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: set test mode
# Always run in replay mode for now until we figure out the test resource to run live mode
run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
#run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: snok/install-poetry@v1
- name: install test dependency group
run: poetry install --only test
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow packages in editable mode
run: |
poetry run pip install -e ../promptflow
poetry run pip install -e ../promptflow-core
poetry run pip install -e ../promptflow-devkit
poetry run pip install -e ../promptflow-tracing
poetry run pip install -e ../promptflow-tools
poetry run pip install -e ../promptflow-evals
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: generate end-to-end test config from secret
run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: check azure is not installed
run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: run e2e tests
id: run_e2e_tests_local
run: |
poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: upload coverage report
uses: actions/upload-artifact@v4
with:
name: report-${{ matrix.os }}-py${{ matrix.python-version }}
path: |
${{ env.WORKING_DIRECTORY }}/*.xml
${{ env.WORKING_DIRECTORY }}/htmlcov/

report:
needs: test
runs-on: ubuntu-latest
permissions:
checks: write
pull-requests: write
contents: read
issues: read
steps:
- uses: actions/download-artifact@v4
with:
path: artifacts
- uses: EnricoMi/publish-unit-test-result-action@v2
with:
check_name: promptflow-evals test result
comment_title: promptflow-evals test result
files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml
- uses: irongut/CodeCoverageSummary@v1.3.0
with:
filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
badge: true
fail_below_min: false
format: markdown
hide_complexity: true
output: both
thresholds: 40 80
1 change: 1 addition & 0 deletions .gitignore
@@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py
# Eclipse project files
**/.project
**/.pydevproject
**/.settings

# benchmark results
benchmark/promptflow-serve/test_runner/locust-results/
21 changes: 21 additions & 0 deletions scripts/code_qa/assert_local_install.py
@@ -0,0 +1,21 @@
"""Tests checking that azure packages are NOT installed."""
import importlib
import pytest


class TestPackagesNotInstalled:
"""Test imports."""

@pytest.mark.parametrize('package', [
'promptflow.azure',
'azure.ai.ml',
'azure.identity',
'azure.storage.blob'
])
def test_promptflow_azure(self, package):
"""Test promptflow. azure is not installed."""
try:
importlib.import_module(package)
assert False, f'Package {package} must be uninstalled for local test.'
except (ModuleNotFoundError, ImportError):
pass
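The new assert_local_install.py gate fails whenever any azure package can be imported in the local job. For reference, a hedged alternative sketch (not part of the PR) that checks availability via importlib.util.find_spec instead of importing the package outright:

import importlib.util

import pytest


def _is_importable(name: str) -> bool:
    """Return True if the module can be found, without fully importing it."""
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        # A missing parent package also counts as "not importable".
        return False


@pytest.mark.parametrize("package", [
    "promptflow.azure",
    "azure.ai.ml",
    "azure.identity",
    "azure.storage.blob",
])
def test_azure_package_not_installed(package):
    assert not _is_importable(package), (
        f"Package {package} must be uninstalled for local tests.")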
17 changes: 14 additions & 3 deletions src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
@@ -21,10 +21,21 @@

LOGGER = logging.getLogger(__name__)


# Handle optional import. The azure libraries are only present if
# promptflow-azure is installed.
try:
from azure.storage.blob import BlobServiceClient
from azure.ai.ml.entities._credentials import AccountKeyConfiguration
except ImportError:
from azure.ai.ml.entities._datastore.datastore import Datastore
from azure.storage.blob import BlobServiceClient
except (ModuleNotFoundError, ImportError):
# If the above mentioned modules cannot be imported, we are running
# in local mode and MLClient in the constructor will be None, so
# we will not arrive to Azure-dependent code.

# We are logging the import failure only if debug logging level is set because:
# - If the project configuration was not provided this import is not needed.
# - If the project configuration was provided, the error will be raised by PFClient.
LOGGER.debug("promptflow.azure is not installed.")


Expand Down Expand Up @@ -413,7 +424,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
if response.status_code != 200:
self._log_warning('register artifact', response)

def _get_datastore_credential(self, datastore: 'Datastore'):
def _get_datastore_credential(self, datastore: "Datastore"):
# Reference the logic in azure.ai.ml._artifact._artifact_utilities
# https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
credential = datastore.credentials
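The comment in the hunk above explains the fallback: when the azure extras are missing, the run operates in local mode, the ML client in the constructor stays None, and Azure-dependent code is never reached. A minimal sketch of that guard pattern, using hypothetical names (RunUploader and upload_artifacts are illustrative, not the PR's actual class or method):

import logging

LOGGER = logging.getLogger(__name__)

try:
    # Optional dependency: only present when the azure extras are installed.
    from azure.storage.blob import BlobServiceClient  # noqa: F401
    AZURE_INSTALLED = True
except (ModuleNotFoundError, ImportError):
    AZURE_INSTALLED = False
    LOGGER.debug("promptflow.azure is not installed.")


class RunUploader:
    """Hypothetical helper illustrating the local-mode short circuit."""

    def __init__(self, ml_client=None):
        # In local mode no MLClient is constructed, so this stays None.
        self._ml_client = ml_client

    def upload_artifacts(self, artifact_folder: str) -> None:
        if self._ml_client is None or not AZURE_INSTALLED:
            LOGGER.debug("Local mode: skipping artifact upload to Azure.")
            return
        # Azure-dependent upload logic would only execute past this point.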
53 changes: 42 additions & 11 deletions src/promptflow-evals/tests/evals/conftest.py
@@ -4,9 +4,8 @@
from typing import Dict
from unittest.mock import patch

import jwt
import pytest
from azure.ai.ml._ml_client import MLClient

from pytest_mock import MockerFixture

from promptflow.client import PFClient
@@ -20,8 +19,8 @@
from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
except ImportError as e:
print(f"Failed to import promptflow-recording: {e}")

# Run test in empty mode if promptflow-recording is not installed

def recording_array_reset():
pass

@@ -37,6 +36,13 @@ def is_record():
def is_replay():
return False

# Import of optional packages
AZURE_INSTALLED = True
try:
import jwt
from azure.ai.ml._ml_client import MLClient
except ImportError:
AZURE_INSTALLED = False

PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
@pytest.fixture
def azure_ml_client(project_scope: Dict):
"""The fixture, returning MLClient"""
return MLClient(
subscription_id=project_scope["subscription_id"],
resource_group_name=project_scope["resource_group_name"],
workspace_name=project_scope["project_name"],
credential=get_cred(),
)
if AZURE_INSTALLED:
return MLClient(
subscription_id=project_scope["subscription_id"],
resource_group_name=project_scope["resource_group_name"],
workspace_name=project_scope["project_name"],
credential=get_cred(),
)
else:
return None


@pytest.fixture
Expand Down Expand Up @@ -293,6 +302,8 @@ def azure_cred():

@pytest.fixture(scope=package_scope_in_live_mode())
def user_object_id() -> str:
if not AZURE_INSTALLED:
return ""
if pytest.is_replay:
from promptflow.recording.azure import SanitizedValues

@@ -305,6 +316,8 @@ def user_object_id() -> str:

@pytest.fixture(scope=package_scope_in_live_mode())
def tenant_id() -> str:
if not AZURE_INSTALLED:
return ""
if pytest.is_replay:
from promptflow.recording.azure import SanitizedValues

@@ -317,9 +330,12 @@ def tenant_id() -> str:

@pytest.fixture(scope=package_scope_in_live_mode())
def variable_recorder():
from promptflow.recording.azure import VariableRecorder
if pytest.is_record or pytest.is_replay:
from promptflow.recording.azure import VariableRecorder

yield VariableRecorder()
yield VariableRecorder()
else:
yield None


@pytest.fixture(scope=package_scope_in_live_mode())
Expand All @@ -346,3 +362,18 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id
yield recording
else:
yield None


def pytest_collection_modifyitems(items):
parents = {}
for item in items:
# Check if parent contains 'localtest' marker and remove it.
if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents:
if id(item.parent) not in parents:
item.parent.own_markers = [
marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest']
parents[id(item.parent)] = item.parent
if not item.get_closest_marker('azuretest'):
# If item's parent was marked as 'localtest', mark the child as such, but not if
# it was marked as 'azuretest'.
item.add_marker(pytest.mark.localtest)
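The hook above auto-applies the localtest marker to collected items; the localtest and azuretest markers themselves also need to be registered with pytest to avoid unknown-marker warnings. That registration is not visible in this diff (it may live in pyproject.toml); a hedged, conftest-based sketch of one way to do it:

def pytest_configure(config):
    # Assumed registration; the repository may declare these markers in
    # pyproject.toml instead. Not part of the PR diff shown above.
    config.addinivalue_line(
        "markers", "localtest: e2e test that runs without any Azure dependency")
    config.addinivalue_line(
        "markers", "azuretest: e2e test that requires Azure resources and credentials")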
@@ -6,7 +6,7 @@


@pytest.mark.usefixtures("recording_injection")
@pytest.mark.e2etest
@pytest.mark.azuretest
class TestAdvSimulator:
@pytest.mark.usefixtures("vcr_recording")
def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope):
@@ -11,7 +11,7 @@


@pytest.mark.usefixtures("recording_injection", "vcr_recording")
@pytest.mark.e2etest
@pytest.mark.localtest
class TestBuiltInEvaluators:
def test_individual_evaluator_prompt_based(self, model_config):
eval_fn = FluencyEvaluator(model_config)
@@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
assert score is not None
assert score["gpt_fluency"] > 0.0

@pytest.mark.azuretest
def test_individual_evaluator_service_based(self, project_scope, azure_cred):
eval_fn = ViolenceEvaluator(project_scope, azure_cred)
score = eval_fn(
@@ -73,6 +74,7 @@ def test_composite_evaluator_qa(self, model_config, parallel):
assert score["gpt_similarity"] > 0.0
assert score["f1_score"] > 0.0

@pytest.mark.azuretest
def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
score = safety_eval(
@@ -156,6 +158,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel):
assert score["evaluation_per_turn"]["gpt_retrieval"] is not None
assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count

@pytest.mark.azuretest
@pytest.mark.parametrize(
"eval_last_turn, parallel",
[
7 changes: 5 additions & 2 deletions src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -6,7 +6,7 @@
import pandas as pd
import pytest
import requests
from azure.identity import DefaultAzureCredential

from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator
@@ -46,6 +45,7 @@ def question_evaluator(question):

def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
"""Get run info from run history"""
from azure.identity import DefaultAzureCredential
token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token
headers = {
"Authorization": token,
@@ -80,7 +80,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope):


@pytest.mark.usefixtures("recording_injection")
@pytest.mark.e2etest
@pytest.mark.localtest
class TestEvaluate:
def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
# data
@@ -118,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
assert row_result_df["outputs.f1_score.f1_score"][2] == 1
assert result["studio_url"] is None

@pytest.mark.azuretest
@pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.")
def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
input_data = pd.read_json(data_file, lines=True)
@@ -301,6 +302,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
assert "answer.length" in metrics.keys()
assert "f1_score.f1_score" in metrics.keys()

@pytest.mark.azuretest
def test_evaluate_track_in_cloud(
self,
questions_file,
@@ -344,6 +346,7 @@ def test_evaluate_track_in_cloud(
assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
assert remote_run["runMetadata"]["displayName"] == evaluation_name

@pytest.mark.azuretest
def test_evaluate_track_in_cloud_no_target(
self,
data_file,
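Putting the gate together: a hedged sketch (the test class and test names here are hypothetical, not in the PR) of how a new e2e test module would opt into each gate, mirroring the marker pattern applied in the test files above:

import pytest


@pytest.mark.usefixtures("recording_injection", "vcr_recording")
@pytest.mark.localtest
class TestMyEvaluator:
    def test_prompt_based(self, model_config):
        ...  # collected by the local gate (pytest -m localtest)

    @pytest.mark.azuretest
    def test_service_based(self, project_scope, azure_cred):
        ...  # collected only by the Azure gate (pytest -m azuretest)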