From 7619024ae641b334a6cced3d9f651c6eb4eb77a5 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 09:43:45 -0700 Subject: [PATCH 01/22] Add local tests --- .../promptflow-evals-local-e2e-test.yml | 93 +++++++++++++++++++ .../promptflow/evals/evaluate/_eval_run.py | 17 +++- .../evaluators/_groundedness/_groundedness.py | 5 +- src/promptflow-evals/tests/evals/conftest.py | 25 +++-- .../tests/evals/e2etests/test_evaluate.py | 7 ++ 5 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/promptflow-evals-local-e2e-test.yml diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml new file mode 100644 index 00000000000..b4092e81a2f --- /dev/null +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -0,0 +1,93 @@ +name: promptflow-evals-e2e-test + +on: + schedule: + - cron: "40 10 * * *" # 2:40 PST every day + pull_request: + paths: + - src/promptflow-evals/** + - .github/workflows/promptflow-evals-e2e-test.yml + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals + +jobs: + test: + needs: build + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-13] + # TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package + # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158 + # Add 3.9 back after we figure out the issue + python-version: ['3.8', '3.10', '3.11'] + fail-fast: false + # snok/install-poetry need this to support Windows + defaults: + run: + shell: bash + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: set test mode + # Always run in replay mode for now until we figure out the test resource to run live mode + run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV + #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: snok/install-poetry@v1 + - name: install test dependency group + run: poetry install --only test + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: install promptflow packages in editable mode + run: | + poetry run pip install -e ../promptflow + poetry run pip install -e ../promptflow-core + poetry run pip install -e ../promptflow-devkit + poetry run pip install -e ../promptflow-tracing + poetry run pip install -e ../promptflow-tools + poetry run pip install -e ../promptflow-evals + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: generate end-to-end test config from secret + run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: run e2e tests + run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: upload coverage report + uses: actions/upload-artifact@v4 + with: + name: report-${{ matrix.os }}-py${{ matrix.python-version }} + path: | + ${{ env.WORKING_DIRECTORY }}/*.xml + ${{ env.WORKING_DIRECTORY }}/htmlcov/ + + report: + needs: test + runs-on: ubuntu-latest + permissions: + checks: write + pull-requests: write + contents: read + issues: read + steps: + - uses: actions/download-artifact@v4 + with: + 
path: artifacts + - uses: EnricoMi/publish-unit-test-result-action@v2 + with: + check_name: promptflow-evals test result + comment_title: promptflow-evals test result + files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml + - uses: irongut/CodeCoverageSummary@v1.3.0 + with: + filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml" + badge: true + fail_below_min: false + format: markdown + hide_complexity: true + output: both + thresholds: 40 80 diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 05fd89b53f2..7aabadac944 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -11,15 +11,24 @@ from typing import Any, Dict, Optional, Type from urllib.parse import urlparse -from azure.storage.blob import BlobServiceClient + from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from promptflow.evals._version import VERSION from promptflow._sdk.entities import Run -from azure.ai.ml.entities._credentials import AccountKeyConfiguration -from azure.ai.ml.entities._datastore.datastore import Datastore +# Handle optional import. The azure libraries are only present if +# promptflow-azure is installed. +try: + from azure.ai.ml.entities._credentials import AccountKeyConfiguration + from azure.ai.ml.entities._datastore.datastore import Datastore + from azure.storage.blob import BlobServiceClient +except (ModuleNotFoundError, ImportError): + # If the above mentioned modules cannot be imported, we are running + # in local mode and MLClient in the constructor will be None, so + # we will not arrive to Azure-dependent code. + pass LOGGER = logging.getLogger(__name__) @@ -382,7 +391,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART if response.status_code != 200: self._log_warning('register artifact', response) - def _get_datastore_credential(self, datastore: Datastore): + def _get_datastore_credential(self, datastore: "Datastore"): # Reference the logic in azure.ai.ml._artifact._artifact_utilities # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103 credential = datastore.credentials diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py index 6eccd607814..282a2801a92 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py @@ -75,7 +75,10 @@ def __call__(self, *, answer: str, context: str, **kwargs): raise ValueError("Both 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow - llm_output = self._flow(answer=answer, context=context) + try: + llm_output = self._flow(answer=answer, context=context) + except BaseException as e: + print(e) score = np.nan if llm_output: diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 4d263152e7b..dc1014c8082 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -6,7 +6,7 @@ import jwt import pytest -from azure.ai.ml._ml_client import MLClient + from pytest_mock import MockerFixture from promptflow.client import PFClient @@ -20,8 +20,8 @@ from 
promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay except ImportError as e: print(f"Failed to import promptflow-recording: {e}") - # Run test in empty mode if promptflow-recording is not installed + def recording_array_reset(): pass @@ -37,6 +37,12 @@ def is_record(): def is_replay(): return False +# Import of optional packages +AZURE_INSTALLED = True +try: + from azure.ai.ml._ml_client import MLClient +except ImportError: + AZURE_INSTALLED = False PROMPTFLOW_ROOT = Path(__file__) / "../../../.." CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix() @@ -147,12 +153,15 @@ def mock_validate_trace_destination(): @pytest.fixture def azure_ml_client(project_scope: Dict): """The fixture, returning MLClient""" - return MLClient( - subscription_id=project_scope["subscription_id"], - resource_group_name=project_scope["resource_group_name"], - workspace_name=project_scope["project_name"], - credential=get_cred(), - ) + if AZURE_INSTALLED: + return MLClient( + subscription_id=project_scope["subscription_id"], + resource_group_name=project_scope["resource_group_name"], + workspace_name=project_scope["project_name"], + credential=get_cred(), + ) + else: + return None @pytest.fixture diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index 16b06daf85f..ebd418bdfb9 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -82,6 +82,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection") @pytest.mark.e2etest class TestEvaluate: + @pytest.mark.localtest def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): # data input_data = pd.read_json(data_file, lines=True) @@ -153,6 +154,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 + @pytest.mark.localtest @pytest.mark.parametrize( "use_pf_client,function,column", [ @@ -186,6 +188,7 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu assert metrics.get(metric) == np.nanmean(row_result_df[out_column]) assert row_result_df[out_column][2] == 31 + @pytest.mark.localtest def test_evaluate_with_target(self, questions_file): """Test evaluation with target function.""" # We cannot define target in this file as pytest will load @@ -209,6 +212,7 @@ def test_evaluate_with_target(self, questions_file): assert "outputs.f1.f1_score" in row_result_df.columns assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"]) + @pytest.mark.localtest @pytest.mark.parametrize( "evaluation_config", [ @@ -251,6 +255,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config): expected = list(row_result_df[question].str.len()) assert expected == list(row_result_df["outputs.question_ev.length"]) + @pytest.mark.localtest @pytest.mark.parametrize( "evaluate_config", [ @@ -386,6 +391,7 @@ def test_evaluate_track_in_cloud_no_target( assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent" assert remote_run["runMetadata"]["displayName"] == evaluation_name + @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ @@ -410,6 +416,7 @@ 
def test_evaluate_aggregation_with_threadpool(self, data_file, return_json, aggr if aggregate_return_json: assert "answer_length.median" in result["metrics"].keys() + @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ From 6e1f0d57ab4b425cc9b7005071a09627368c3f1a Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:22:34 -0700 Subject: [PATCH 02/22] Add assertion that azure packages are not installed. --- .../promptflow-evals-local-e2e-test.yml | 5 ++++- scripts/code_qa/assert_local_install.py | 16 ++++++++++++++++ .../evals/e2etests/test_builtin_evaluators.py | 4 ++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 scripts/code_qa/assert_local_install.py diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index b4092e81a2f..49d4da13738 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -54,8 +54,11 @@ jobs: - name: generate end-to-end test config from secret run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} + - name check azure is not installed + run: poetry run pytest scripts/assert_local_install.py + working-directory: ${{ github.workspace }} - name: run e2e tests - run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report uses: actions/upload-artifact@v4 diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py new file mode 100644 index 00000000000..9d358171297 --- /dev/null +++ b/scripts/code_qa/assert_local_install.py @@ -0,0 +1,16 @@ +"""Tests checking that azure packages are NOT installed.""" +import importlib +import pytest + +class TestPackagesNotInstalles(): + """Test imports.""" + + @pytest.mark.parametrize('package', [ + 'promptflow.azure', + 'azure.ai.ml', + 'azure.identity', + 'azure.storage.blob' + ]) + def test_promptflow_azure(self, package): + """Test promptflow. azure is not installed.""" + assert importlib.util.find_spec(package) is None, f'Package {package} must be uninstalled for local test.' 
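Note on the check above: importlib.util.find_spec() imports the parent packages of a dotted name, so with no azure.* distribution installed it can raise ModuleNotFoundError itself rather than return None; a later commit in this series ([PATCH 09/22] "Fix script") switches to attempting the import directly. A rough standalone sketch of that import-based check, not taken from the patch; the package list mirrors assert_local_install.py and the helper name is illustrative:

import importlib

OPTIONAL_PACKAGES = [
    "promptflow.azure",
    "azure.ai.ml",
    "azure.identity",
    "azure.storage.blob",
]


def assert_not_installed(package: str) -> None:
    """Fail if an Azure-only dependency is importable in a local-only environment."""
    try:
        importlib.import_module(package)
    except (ModuleNotFoundError, ImportError):
        return  # expected when running the local test matrix
    raise AssertionError(f"Package {package} must be uninstalled for local tests.")


if __name__ == "__main__":
    for pkg in OPTIONAL_PACKAGES:
        assert_not_installed(pkg)
    print("No optional Azure packages importable; local-only environment confirmed.")

Attempting the import and treating ModuleNotFoundError/ImportError as success keeps the check meaningful for namespace packages, where a spec lookup alone can behave inconsistently.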
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 17bfb5029cf..8eeae5f2bfb 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -13,6 +13,7 @@ @pytest.mark.usefixtures("recording_injection", "vcr_recording") @pytest.mark.e2etest class TestBuiltInEvaluators: + @pytest.mark.localtest def test_individual_evaluator_prompt_based(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -22,6 +23,7 @@ def test_individual_evaluator_prompt_based(self, model_config): assert score is not None assert score["gpt_fluency"] > 1.0 + @pytest.mark.localtest def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -55,6 +57,7 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred): assert "RAI service is not available in this region" in exc_info._excinfo[1].inner_exception.args[0] + @pytest.mark.localtest @pytest.mark.parametrize("parallel", [False, True]) def test_composite_evaluator_qa(self, model_config, parallel): qa_eval = QAEvaluator(model_config, parallel=parallel) @@ -94,6 +97,7 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred): assert score["hate_unfairness_score"] < 1.0 assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." + @pytest.mark.localtest @pytest.mark.parametrize( "eval_last_turn, parallel", [ From 5335b5449984131cbf5ee16c4c30932e3f42f93b Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:36:38 -0700 Subject: [PATCH 03/22] Typos --- .github/workflows/promptflow-evals-local-e2e-test.yml | 4 ++-- scripts/code_qa/assert_local_install.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 49d4da13738..1c3092ef8f2 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -1,4 +1,4 @@ -name: promptflow-evals-e2e-test +name: promptflow-evals_local-e2e-test on: schedule: @@ -55,7 +55,7 @@ jobs: run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} - name check azure is not installed - run: poetry run pytest scripts/assert_local_install.py + run: poetry run pytest scripts/code_qa/assert_local_install.py working-directory: ${{ github.workspace }} - name: run e2e tests run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py index 9d358171297..5dca70650b0 100644 --- a/scripts/code_qa/assert_local_install.py +++ b/scripts/code_qa/assert_local_install.py @@ -2,6 +2,7 @@ import importlib import pytest + class TestPackagesNotInstalles(): """Test imports.""" From 564e6724bee6d2825c3d2e34708fee7890d5934b Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:47:47 -0700 Subject: [PATCH 04/22] Fix test name --- .github/workflows/promptflow-evals-local-e2e-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 1c3092ef8f2..96fbf22ff19 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -6,7 +6,7 @@ on: pull_request: paths: - src/promptflow-evals/** - - .github/workflows/promptflow-evals-e2e-test.yml + - .github/workflows/promptflow-evals-local-e2e-test.yml workflow_dispatch: env: From 319f8c288b1ad18ea0014396f47f662447a087dd Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 13:02:45 -0700 Subject: [PATCH 05/22] Fix workflow file --- .github/workflows/promptflow-evals-local-e2e-test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 96fbf22ff19..f1e70016c03 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -1,4 +1,4 @@ -name: promptflow-evals_local-e2e-test +name: promptflow-evals-local-e2e-test on: schedule: @@ -15,7 +15,6 @@ env: jobs: test: - needs: build strategy: matrix: os: [ubuntu-latest, windows-latest, macos-13] From cc589d41c547083dacb6c5a550ff0e2878cc5481 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 14:24:24 -0700 Subject: [PATCH 06/22] Fix workflow file II --- .github/workflows/promptflow-evals-local-e2e-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index f1e70016c03..1bf37e83aee 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -53,7 +53,7 @@ jobs: - name: generate end-to-end test config from secret run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} - - name check azure is not installed + - name: check azure is not installed run: poetry run pytest scripts/code_qa/assert_local_install.py working-directory: ${{ github.workspace }} - name: run e2e tests From 8020ad42b632dd8639b7d13de03841361a5492cf Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 14:45:11 -0700 Subject: [PATCH 07/22] Run test in the project directory --- .github/workflows/promptflow-evals-local-e2e-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 1bf37e83aee..db9ca8d2716 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -54,8 +54,8 @@ jobs: run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} - name: check azure is not installed - run: poetry run pytest scripts/code_qa/assert_local_install.py - working-directory: ${{ github.workspace }} + run: poetry run pytest ../../scripts/code_qa/assert_local_install.py + working-directory: ${{ env.WORKING_DIRECTORY }} - name: run e2e tests run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} From 
487073a32aea40bccc7dadbfaf2fd0c978a56edf Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:59:23 -0700 Subject: [PATCH 08/22] Fix and rename pipelines --- ...e-test.yml => promptflow-evals-e2e-test-azure.yml} | 6 +++--- ...e-test.yml => promptflow-evals-e2e-test-local.yml} | 6 +++--- .../promptflow/evals/evaluate/_eval_run.py | 11 ++++++++--- .../evals/evaluators/_groundedness/_groundedness.py | 5 +---- .../tests/evals/e2etests/test_adv_simulator.py | 2 +- .../tests/evals/e2etests/test_builtin_evaluators.py | 5 ++++- .../tests/evals/e2etests/test_evaluate.py | 5 ++++- .../tests/evals/e2etests/test_metrics_upload.py | 2 +- 8 files changed, 25 insertions(+), 17 deletions(-) rename .github/workflows/{promptflow-evals-e2e-test.yml => promptflow-evals-e2e-test-azure.yml} (94%) rename .github/workflows/{promptflow-evals-local-e2e-test.yml => promptflow-evals-e2e-test-local.yml} (95%) diff --git a/.github/workflows/promptflow-evals-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-azure.yml similarity index 94% rename from .github/workflows/promptflow-evals-e2e-test.yml rename to .github/workflows/promptflow-evals-e2e-test-azure.yml index 2275f9cbd6c..304bed4383d 100644 --- a/.github/workflows/promptflow-evals-e2e-test.yml +++ b/.github/workflows/promptflow-evals-e2e-test-azure.yml @@ -1,4 +1,4 @@ -name: promptflow-evals-e2e-test +name: promptflow-evals-e2e-test-azure on: schedule: @@ -6,7 +6,7 @@ on: pull_request: paths: - src/promptflow-evals/** - - .github/workflows/promptflow-evals-e2e-test.yml + - .github/workflows/promptflow-evals-e2e-test-azure.yml workflow_dispatch: env: @@ -83,7 +83,7 @@ jobs: creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }} enable-AzPSSession: true - name: run e2e tests - run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + run: poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report uses: actions/upload-artifact@v4 diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml similarity index 95% rename from .github/workflows/promptflow-evals-local-e2e-test.yml rename to .github/workflows/promptflow-evals-e2e-test-local.yml index db9ca8d2716..fc6e7254b6e 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ -1,4 +1,4 @@ -name: promptflow-evals-local-e2e-test +name: promptflow-evals-e2e-test-local on: schedule: @@ -6,7 +6,7 @@ on: pull_request: paths: - src/promptflow-evals/** - - .github/workflows/promptflow-evals-local-e2e-test.yml + - .github/workflows/promptflow-evals-e2e-test-local.yml workflow_dispatch: env: @@ -21,7 +21,7 @@ jobs: # TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158 # Add 3.9 back after we figure out the issue - python-version: ['3.8', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] fail-fast: false # snok/install-poetry need this to support Windows defaults: diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 7aabadac944..9c1f5abea5d 100644 --- 
a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -18,6 +18,8 @@ from promptflow.evals._version import VERSION from promptflow._sdk.entities import Run +LOGGER = logging.getLogger(__name__) + # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. try: @@ -28,9 +30,12 @@ # If the above mentioned modules cannot be imported, we are running # in local mode and MLClient in the constructor will be None, so # we will not arrive to Azure-dependent code. - pass - -LOGGER = logging.getLogger(__name__) + LOGGER.warning( + "azure-ai-ml cannot be imported. " + "The results will be saved locally, but will not be logged to Azure. " + "To log results to azure please install promptflow-evals with the command " + "pip install promptflow-evals[azure]" + ) @dataclasses.dataclass diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py index 282a2801a92..6eccd607814 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py @@ -75,10 +75,7 @@ def __call__(self, *, answer: str, context: str, **kwargs): raise ValueError("Both 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow - try: - llm_output = self._flow(answer=answer, context=context) - except BaseException as e: - print(e) + llm_output = self._flow(answer=answer, context=context) score = np.nan if llm_output: diff --git a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py index 1faef92a46a..16cd0bab1df 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py @@ -6,7 +6,7 @@ @pytest.mark.usefixtures("recording_injection") -@pytest.mark.e2etest +@pytest.mark.azuretest class TestAdvSimulator: @pytest.mark.usefixtures("vcr_recording") def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope): diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 8eeae5f2bfb..6e714a013b5 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -11,7 +11,6 @@ @pytest.mark.usefixtures("recording_injection", "vcr_recording") -@pytest.mark.e2etest class TestBuiltInEvaluators: @pytest.mark.localtest def test_individual_evaluator_prompt_based(self, model_config): @@ -33,6 +32,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): assert score is not None assert score["gpt_fluency"] > 0.0 + @pytest.mark.azuretest def test_individual_evaluator_service_based(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) score = eval_fn( @@ -44,6 +44,7 @@ def test_individual_evaluator_service_based(self, project_scope, azure_cred): assert score["violence_score"] < 1.0 assert score["violence_reason"], "violence_reason must not be None or empty." + @pytest.mark.azuretest @pytest.mark.skip(reason="Not working in ci pipeline. 
For local run.") def test_content_safety_service_unavailable(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) @@ -76,6 +77,7 @@ def test_composite_evaluator_qa(self, model_config, parallel): assert score["gpt_similarity"] > 0.0 assert score["f1_score"] > 0.0 + @pytest.mark.azuretest def test_composite_evaluator_content_safety(self, project_scope, azure_cred): safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred) score = safety_eval( @@ -160,6 +162,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel): assert score["evaluation_per_turn"]["gpt_retrieval"] is not None assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count + @pytest.mark.azuretest @pytest.mark.parametrize( "eval_last_turn, parallel", [ diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index ebd418bdfb9..88872f5a3b7 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -80,7 +80,6 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection") -@pytest.mark.e2etest class TestEvaluate: @pytest.mark.localtest def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): @@ -119,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): assert row_result_df["outputs.f1_score.f1_score"][2] == 1 assert result["studio_url"] is None + @pytest.mark.azuretest @pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.") def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred): input_data = pd.read_json(data_file, lines=True) @@ -306,6 +306,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config): assert "answer.length" in metrics.keys() assert "f1_score.f1_score" in metrics.keys() + @pytest.mark.azuretest def test_evaluate_track_in_cloud( self, questions_file, @@ -349,6 +350,7 @@ def test_evaluate_track_in_cloud( assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run" assert remote_run["runMetadata"]["displayName"] == evaluation_name + @pytest.mark.azuretest def test_evaluate_track_in_cloud_no_target( self, data_file, @@ -441,6 +443,7 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso if aggregate_return_json: assert "answer_length.median" in result["metrics"].keys() + @pytest.mark.localtest @pytest.mark.skip(reason="TODO: Add test back") def test_prompty_with_threadpool_implementation(self): pass diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py index 24ca1dd743c..e300348711b 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py @@ -48,7 +48,7 @@ def setup_data(azure_ml_client, project_scope): @pytest.mark.usefixtures("model_config", "recording_injection", "project_scope") -@pytest.mark.e2etest +@pytest.mark.azuretest class TestMetricsUpload(object): """End to end tests to check how the metrics were uploaded to cloud.""" From b3ee5fdad334ba5aa32adf7a6222a43d478f414f Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:26:15 -0700 Subject: [PATCH 09/22] Fix script --- 
scripts/code_qa/assert_local_install.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py index 5dca70650b0..3c9f56bd6d5 100644 --- a/scripts/code_qa/assert_local_install.py +++ b/scripts/code_qa/assert_local_install.py @@ -14,4 +14,8 @@ class TestPackagesNotInstalles(): ]) def test_promptflow_azure(self, package): """Test promptflow. azure is not installed.""" - assert importlib.util.find_spec(package) is None, f'Package {package} must be uninstalled for local test.' + try: + importlib.import_module(package) + assert False, f'Package {package} must be uninstalled for local test.' + except (ModuleNotFoundError, ImportError): + pass From 073ff027a922e7670298499d0e575b35a1fe82b2 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:44:44 -0700 Subject: [PATCH 10/22] Make jwt optional import in conftest --- src/promptflow-evals/tests/evals/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index dc1014c8082..2e303ffb4b7 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -4,7 +4,6 @@ from typing import Dict from unittest.mock import patch -import jwt import pytest from pytest_mock import MockerFixture @@ -40,6 +39,7 @@ def is_replay(): # Import of optional packages AZURE_INSTALLED = True try: + import jwt from azure.ai.ml._ml_client import MLClient except ImportError: AZURE_INSTALLED = False From ae94b1fe06557557820d3f7d756edb81fd13fe3f Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:02:03 -0700 Subject: [PATCH 11/22] Fix import for local test --- src/promptflow-evals/tests/evals/e2etests/test_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index 88872f5a3b7..66123c77406 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -6,7 +6,6 @@ import pandas as pd import pytest import requests -from azure.identity import DefaultAzureCredential from promptflow.evals.evaluate import evaluate from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator @@ -46,6 +45,7 @@ def question_evaluator(question): def _get_run_from_run_history(flow_run_id, ml_client, project_scope): """Get run info from run history""" + from azure.identity import DefaultAzureCredential token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token headers = { "Authorization": token, From be84305d7e16b378cead9b18a9b64b0c8e9be904 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:17:27 -0700 Subject: [PATCH 12/22] Fix imports --- .github/workflows/promptflow-evals-e2e-test-local.yml | 2 +- .../tests/evals/e2etests/test_metrics_upload.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml index fc6e7254b6e..f808956e451 100644 --- a/.github/workflows/promptflow-evals-e2e-test-local.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ 
-57,7 +57,7 @@ jobs: run: poetry run pytest ../../scripts/code_qa/assert_local_install.py working-directory: ${{ env.WORKING_DIRECTORY }} - name: run e2e tests - run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + run: poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report uses: actions/upload-artifact@v4 diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py index e300348711b..3ed8c63a515 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py @@ -10,8 +10,12 @@ from promptflow.evals.evaluate._eval_run import EvalRun from promptflow.evals.evaluate._evaluate import evaluate from promptflow.evals.evaluators._f1_score._f1_score import F1ScoreEvaluator -from promptflow.recording.record_mode import is_live from promptflow.tracing import _start_trace +try: + from promptflow.recording.record_mode import is_live +except ModuleNotFoundError: + # The file is being imported by the local test + pass @pytest.fixture From e8c42e6d5abc8853fd3c6fb6257b1b70e9a0cf90 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:35:57 -0700 Subject: [PATCH 13/22] Do not try to get token if azure is not installed --- src/promptflow-evals/tests/evals/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 2e303ffb4b7..b0f22a35259 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -302,6 +302,8 @@ def azure_cred(): @pytest.fixture(scope=package_scope_in_live_mode()) def user_object_id() -> str: + if not AZURE_INSTALLED: + return "" if pytest.is_replay: from promptflow.recording.azure import SanitizedValues @@ -314,6 +316,8 @@ def user_object_id() -> str: @pytest.fixture(scope=package_scope_in_live_mode()) def tenant_id() -> str: + if not AZURE_INSTALLED: + return "" if pytest.is_replay: from promptflow.recording.azure import SanitizedValues From 870094427f66311e812978b812465bcbbd84ff9e Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:55:33 -0700 Subject: [PATCH 14/22] Fix recording --- src/promptflow-evals/tests/evals/conftest.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index b0f22a35259..f57a77ff837 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -330,9 +330,12 @@ def tenant_id() -> str: @pytest.fixture(scope=package_scope_in_live_mode()) def variable_recorder(): - from promptflow.recording.azure import VariableRecorder - - yield VariableRecorder() + if pytest.is_record or pytest.is_replay: + from promptflow.recording.azure import VariableRecorder + + yield VariableRecorder() + else: + yield None @pytest.fixture(scope=package_scope_in_live_mode()) From 9e9d29ad162e5e77ccf4a2b97a79d97c40116b95 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:21:01 -0700 Subject: 
[PATCH 15/22] Fix linter --- src/promptflow-evals/tests/evals/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index f57a77ff837..eec82c01508 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -332,7 +332,7 @@ def tenant_id() -> str: def variable_recorder(): if pytest.is_record or pytest.is_replay: from promptflow.recording.azure import VariableRecorder - + yield VariableRecorder() else: yield None From 150a081c7130ba879fb06617ce4cdc17ac581afd Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:00:15 -0700 Subject: [PATCH 16/22] Fixes --- .../promptflow-evals-e2e-test-local.yml | 1 - .../promptflow/evals/evaluate/_eval_run.py | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml index 30fb666d64e..4adaf86c5a9 100644 --- a/.github/workflows/promptflow-evals-e2e-test-local.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ -60,7 +60,6 @@ jobs: id: run_e2e_tests_local run: | poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml - poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }} working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 57aed2a5441..6c689c7a016 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -22,6 +22,7 @@ # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. +_AZURE_IMPORTED = True try: from azure.ai.ml.entities._credentials import AccountKeyConfiguration from azure.ai.ml.entities._datastore.datastore import Datastore @@ -30,12 +31,11 @@ # If the above mentioned modules cannot be imported, we are running # in local mode and MLClient in the constructor will be None, so # we will not arrive to Azure-dependent code. - LOGGER.warning( - "azure-ai-ml cannot be imported. " - "The results will be saved locally, but will not be logged to Azure. " - "To log results to azure please install promptflow-evals with the command " - "pip install promptflow-evals[azure]" - ) + + # We are not logging the import failure because + # - If the project configuration was not provided this import is not needed. + # - If the project configuration was provided, the error will be raised by PFClient. 
+ pass @dataclasses.dataclass @@ -133,8 +133,11 @@ def __init__(self, self._is_promptflow_run: bool = promptflow_run is not None self._is_broken = False if self._tracking_uri is None: - LOGGER.warning("tracking_uri was not provided, " - "The results will be saved locally, but will not be logged to Azure.") + if self._tracking_uri: + LOGGER.warning( + "tracking_uri was not provided, " + "The results will be saved locally, but will not be logged to Azure." + ) self._url_base = None self._is_broken = True self.info = RunInfo.generate(run_name) From 3dc5c9bbbb1d1b89fe343a3450c66ad968b98bd3 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:01:28 -0700 Subject: [PATCH 17/22] Fix indentation --- .github/workflows/promptflow-evals-e2e-test-local.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml index 4adaf86c5a9..f5cef2aa4d2 100644 --- a/.github/workflows/promptflow-evals-e2e-test-local.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ -59,7 +59,7 @@ jobs: - name: run e2e tests id: run_e2e_tests_local run: | - poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }} working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report From 96fd9fbef034f30e593f5a3d0617e1169f15b24d Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:18:59 -0700 Subject: [PATCH 18/22] Remove extra code --- .../promptflow/evals/evaluate/_eval_run.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 6c689c7a016..75539ed7381 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -22,7 +22,6 @@ # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. -_AZURE_IMPORTED = True try: from azure.ai.ml.entities._credentials import AccountKeyConfiguration from azure.ai.ml.entities._datastore.datastore import Datastore @@ -133,11 +132,10 @@ def __init__(self, self._is_promptflow_run: bool = promptflow_run is not None self._is_broken = False if self._tracking_uri is None: - if self._tracking_uri: - LOGGER.warning( - "tracking_uri was not provided, " - "The results will be saved locally, but will not be logged to Azure." - ) + LOGGER.warning( + "tracking_uri was not provided, " + "The results will be saved locally, but will not be logged to Azure." 
+ ) self._url_base = None self._is_broken = True self.info = RunInfo.generate(run_name) From abca0e9d887b223abe8c0e121ab7f5794e255fb4 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:12:01 -0700 Subject: [PATCH 19/22] Add logging back and add .settings to gitignore --- .gitignore | 1 + src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9ef59b176df..5489d5b6745 100644 --- a/.gitignore +++ b/.gitignore @@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py # Eclipse project files **/.project **/.pydevproject +**/.settings # benchmark results benchmark/promptflow-serve/test_runner/locust-results/ \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 75539ed7381..86deb73fbc2 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -31,10 +31,10 @@ # in local mode and MLClient in the constructor will be None, so # we will not arrive to Azure-dependent code. - # We are not logging the import failure because + # We are logging the import failure only if debug logging level is set because: # - If the project configuration was not provided this import is not needed. # - If the project configuration was provided, the error will be raised by PFClient. - pass + LOGGER.debug("promptflow.azure is not installed.") @dataclasses.dataclass From b94bb2bf908c4f8becf602e62b496144edc7b85b Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:37:16 -0700 Subject: [PATCH 20/22] Fix --- .../tests/evals/e2etests/test_builtin_evaluators.py | 7 +------ .../tests/evals/e2etests/test_evaluate.py | 9 +-------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 6e714a013b5..77306cdb576 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -11,8 +11,8 @@ @pytest.mark.usefixtures("recording_injection", "vcr_recording") +@pytest.mark.localtest class TestBuiltInEvaluators: - @pytest.mark.localtest def test_individual_evaluator_prompt_based(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -22,7 +22,6 @@ def test_individual_evaluator_prompt_based(self, model_config): assert score is not None assert score["gpt_fluency"] > 1.0 - @pytest.mark.localtest def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -32,7 +31,6 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): assert score is not None assert score["gpt_fluency"] > 0.0 - @pytest.mark.azuretest def test_individual_evaluator_service_based(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) score = eval_fn( @@ -44,7 +42,6 @@ def test_individual_evaluator_service_based(self, project_scope, azure_cred): assert score["violence_score"] < 1.0 assert score["violence_reason"], "violence_reason must not be None or empty." - @pytest.mark.azuretest @pytest.mark.skip(reason="Not working in ci pipeline. 
For local run.") def test_content_safety_service_unavailable(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) @@ -58,7 +55,6 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred): assert "RAI service is not available in this region" in exc_info._excinfo[1].inner_exception.args[0] - @pytest.mark.localtest @pytest.mark.parametrize("parallel", [False, True]) def test_composite_evaluator_qa(self, model_config, parallel): qa_eval = QAEvaluator(model_config, parallel=parallel) @@ -99,7 +95,6 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred): assert score["hate_unfairness_score"] < 1.0 assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." - @pytest.mark.localtest @pytest.mark.parametrize( "eval_last_turn, parallel", [ diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index 66123c77406..f57c05e35ce 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -80,8 +80,8 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection") +@pytest.mark.localtest class TestEvaluate: - @pytest.mark.localtest def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): # data input_data = pd.read_json(data_file, lines=True) @@ -154,7 +154,6 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 - @pytest.mark.localtest @pytest.mark.parametrize( "use_pf_client,function,column", [ @@ -188,7 +187,6 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu assert metrics.get(metric) == np.nanmean(row_result_df[out_column]) assert row_result_df[out_column][2] == 31 - @pytest.mark.localtest def test_evaluate_with_target(self, questions_file): """Test evaluation with target function.""" # We cannot define target in this file as pytest will load @@ -212,7 +210,6 @@ def test_evaluate_with_target(self, questions_file): assert "outputs.f1.f1_score" in row_result_df.columns assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"]) - @pytest.mark.localtest @pytest.mark.parametrize( "evaluation_config", [ @@ -255,7 +252,6 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config): expected = list(row_result_df[question].str.len()) assert expected == list(row_result_df["outputs.question_ev.length"]) - @pytest.mark.localtest @pytest.mark.parametrize( "evaluate_config", [ @@ -393,7 +389,6 @@ def test_evaluate_track_in_cloud_no_target( assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent" assert remote_run["runMetadata"]["displayName"] == evaluation_name - @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ @@ -418,7 +413,6 @@ def test_evaluate_aggregation_with_threadpool(self, data_file, return_json, aggr if aggregate_return_json: assert "answer_length.median" in result["metrics"].keys() - @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ @@ -443,7 +437,6 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso if aggregate_return_json: assert "answer_length.median" in 
result["metrics"].keys() - @pytest.mark.localtest @pytest.mark.skip(reason="TODO: Add test back") def test_prompty_with_threadpool_implementation(self): pass From f9bf2173de00c8d701edd21563fa8baec8da33a3 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:52:31 -0700 Subject: [PATCH 21/22] Add conftest change --- src/promptflow-evals/tests/evals/conftest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index eec82c01508..396673637a3 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -362,3 +362,9 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id yield recording else: yield None + + +def pytest_collection_modifyitems(items): + for item in items: + if item.get_closest_marker('azuretest'): + item.own_markers = [marker for marker in item.own_markers if marker.name != 'localtest'] From 851385c370d006f2e03146b85c6e95ee574d3b6e Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:36:35 -0700 Subject: [PATCH 22/22] Fix conftest --- src/promptflow-evals/tests/evals/conftest.py | 13 +++++++++++-- .../tests/evals/e2etests/test_builtin_evaluators.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 396673637a3..e184b334628 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -365,6 +365,15 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id def pytest_collection_modifyitems(items): + parents = {} for item in items: - if item.get_closest_marker('azuretest'): - item.own_markers = [marker for marker in item.own_markers if marker.name != 'localtest'] + # Check if parent contains 'localtest' marker and remove it. + if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents: + if id(item.parent) not in parents: + item.parent.own_markers = [ + marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest'] + parents[id(item.parent)] = item.parent + if not item.get_closest_marker('azuretest'): + # If item's parent was marked as 'localtest', mark the child as such, but not if + # it was marked as 'azuretest'. + item.add_marker(pytest.mark.localtest) diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 77306cdb576..e1a305ca388 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): assert score is not None assert score["gpt_fluency"] > 0.0 + @pytest.mark.azuretest def test_individual_evaluator_service_based(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) score = eval_fn(