From 7619024ae641b334a6cced3d9f651c6eb4eb77a5 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 09:43:45 -0700 Subject: [PATCH 01/22] Add local tests --- .../promptflow-evals-local-e2e-test.yml | 93 +++++++++++++++++++ .../promptflow/evals/evaluate/_eval_run.py | 17 +++- .../evaluators/_groundedness/_groundedness.py | 5 +- src/promptflow-evals/tests/evals/conftest.py | 25 +++-- .../tests/evals/e2etests/test_evaluate.py | 7 ++ 5 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/promptflow-evals-local-e2e-test.yml diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml new file mode 100644 index 00000000000..b4092e81a2f --- /dev/null +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -0,0 +1,93 @@ +name: promptflow-evals-e2e-test + +on: + schedule: + - cron: "40 10 * * *" # 2:40 PST every day + pull_request: + paths: + - src/promptflow-evals/** + - .github/workflows/promptflow-evals-e2e-test.yml + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals + +jobs: + test: + needs: build + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-13] + # TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package + # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158 + # Add 3.9 back after we figure out the issue + python-version: ['3.8', '3.10', '3.11'] + fail-fast: false + # snok/install-poetry need this to support Windows + defaults: + run: + shell: bash + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: set test mode + # Always run in replay mode for now until we figure out the test resource to run live mode + run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV + #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: snok/install-poetry@v1 + - name: install test dependency group + run: poetry install --only test + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: install promptflow packages in editable mode + run: | + poetry run pip install -e ../promptflow + poetry run pip install -e ../promptflow-core + poetry run pip install -e ../promptflow-devkit + poetry run pip install -e ../promptflow-tracing + poetry run pip install -e ../promptflow-tools + poetry run pip install -e ../promptflow-evals + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: generate end-to-end test config from secret + run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: run e2e tests + run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + working-directory: ${{ env.WORKING_DIRECTORY }} + - name: upload coverage report + uses: actions/upload-artifact@v4 + with: + name: report-${{ matrix.os }}-py${{ matrix.python-version }} + path: | + ${{ env.WORKING_DIRECTORY }}/*.xml + ${{ env.WORKING_DIRECTORY }}/htmlcov/ + + report: + needs: test + runs-on: ubuntu-latest + permissions: + checks: write + pull-requests: write + contents: read + issues: read + steps: + - uses: actions/download-artifact@v4 + with: + 
path: artifacts + - uses: EnricoMi/publish-unit-test-result-action@v2 + with: + check_name: promptflow-evals test result + comment_title: promptflow-evals test result + files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml + - uses: irongut/CodeCoverageSummary@v1.3.0 + with: + filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml" + badge: true + fail_below_min: false + format: markdown + hide_complexity: true + output: both + thresholds: 40 80 diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 05fd89b53f2..7aabadac944 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -11,15 +11,24 @@ from typing import Any, Dict, Optional, Type from urllib.parse import urlparse -from azure.storage.blob import BlobServiceClient + from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from promptflow.evals._version import VERSION from promptflow._sdk.entities import Run -from azure.ai.ml.entities._credentials import AccountKeyConfiguration -from azure.ai.ml.entities._datastore.datastore import Datastore +# Handle optional import. The azure libraries are only present if +# promptflow-azure is installed. +try: + from azure.ai.ml.entities._credentials import AccountKeyConfiguration + from azure.ai.ml.entities._datastore.datastore import Datastore + from azure.storage.blob import BlobServiceClient +except (ModuleNotFoundError, ImportError): + # If the above mentioned modules cannot be imported, we are running + # in local mode and MLClient in the constructor will be None, so + # we will not arrive to Azure-dependent code. + pass LOGGER = logging.getLogger(__name__) @@ -382,7 +391,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART if response.status_code != 200: self._log_warning('register artifact', response) - def _get_datastore_credential(self, datastore: Datastore): + def _get_datastore_credential(self, datastore: "Datastore"): # Reference the logic in azure.ai.ml._artifact._artifact_utilities # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103 credential = datastore.credentials diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py index 6eccd607814..282a2801a92 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py @@ -75,7 +75,10 @@ def __call__(self, *, answer: str, context: str, **kwargs): raise ValueError("Both 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow - llm_output = self._flow(answer=answer, context=context) + try: + llm_output = self._flow(answer=answer, context=context) + except BaseException as e: + print(e) score = np.nan if llm_output: diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 4d263152e7b..dc1014c8082 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -6,7 +6,7 @@ import jwt import pytest -from azure.ai.ml._ml_client import MLClient + from pytest_mock import MockerFixture from promptflow.client import PFClient @@ -20,8 +20,8 @@ from 
promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay except ImportError as e: print(f"Failed to import promptflow-recording: {e}") - # Run test in empty mode if promptflow-recording is not installed + def recording_array_reset(): pass @@ -37,6 +37,12 @@ def is_record(): def is_replay(): return False +# Import of optional packages +AZURE_INSTALLED = True +try: + from azure.ai.ml._ml_client import MLClient +except ImportError: + AZURE_INSTALLED = False PROMPTFLOW_ROOT = Path(__file__) / "../../../.." CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix() @@ -147,12 +153,15 @@ def mock_validate_trace_destination(): @pytest.fixture def azure_ml_client(project_scope: Dict): """The fixture, returning MLClient""" - return MLClient( - subscription_id=project_scope["subscription_id"], - resource_group_name=project_scope["resource_group_name"], - workspace_name=project_scope["project_name"], - credential=get_cred(), - ) + if AZURE_INSTALLED: + return MLClient( + subscription_id=project_scope["subscription_id"], + resource_group_name=project_scope["resource_group_name"], + workspace_name=project_scope["project_name"], + credential=get_cred(), + ) + else: + return None @pytest.fixture diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index 16b06daf85f..ebd418bdfb9 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -82,6 +82,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection") @pytest.mark.e2etest class TestEvaluate: + @pytest.mark.localtest def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): # data input_data = pd.read_json(data_file, lines=True) @@ -153,6 +154,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 + @pytest.mark.localtest @pytest.mark.parametrize( "use_pf_client,function,column", [ @@ -186,6 +188,7 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu assert metrics.get(metric) == np.nanmean(row_result_df[out_column]) assert row_result_df[out_column][2] == 31 + @pytest.mark.localtest def test_evaluate_with_target(self, questions_file): """Test evaluation with target function.""" # We cannot define target in this file as pytest will load @@ -209,6 +212,7 @@ def test_evaluate_with_target(self, questions_file): assert "outputs.f1.f1_score" in row_result_df.columns assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"]) + @pytest.mark.localtest @pytest.mark.parametrize( "evaluation_config", [ @@ -251,6 +255,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config): expected = list(row_result_df[question].str.len()) assert expected == list(row_result_df["outputs.question_ev.length"]) + @pytest.mark.localtest @pytest.mark.parametrize( "evaluate_config", [ @@ -386,6 +391,7 @@ def test_evaluate_track_in_cloud_no_target( assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent" assert remote_run["runMetadata"]["displayName"] == evaluation_name + @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ @@ -410,6 +416,7 @@ 
def test_evaluate_aggregation_with_threadpool(self, data_file, return_json, aggr if aggregate_return_json: assert "answer_length.median" in result["metrics"].keys() + @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ From 6e1f0d57ab4b425cc9b7005071a09627368c3f1a Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:22:34 -0700 Subject: [PATCH 02/22] Add assertion that azure packages are not installed. --- .../promptflow-evals-local-e2e-test.yml | 5 ++++- scripts/code_qa/assert_local_install.py | 16 ++++++++++++++++ .../evals/e2etests/test_builtin_evaluators.py | 4 ++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 scripts/code_qa/assert_local_install.py diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index b4092e81a2f..49d4da13738 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -54,8 +54,11 @@ jobs: - name: generate end-to-end test config from secret run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} + - name check azure is not installed + run: poetry run pytest scripts/assert_local_install.py + working-directory: ${{ github.workspace }} - name: run e2e tests - run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report uses: actions/upload-artifact@v4 diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py new file mode 100644 index 00000000000..9d358171297 --- /dev/null +++ b/scripts/code_qa/assert_local_install.py @@ -0,0 +1,16 @@ +"""Tests checking that azure packages are NOT installed.""" +import importlib +import pytest + +class TestPackagesNotInstalles(): + """Test imports.""" + + @pytest.mark.parametrize('package', [ + 'promptflow.azure', + 'azure.ai.ml', + 'azure.identity', + 'azure.storage.blob' + ]) + def test_promptflow_azure(self, package): + """Test promptflow. azure is not installed.""" + assert importlib.util.find_spec(package) is None, f'Package {package} must be uninstalled for local test.' 
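Note on the check above: importlib.util.find_spec() imports the parent packages of a dotted name, so with no azure.* distribution installed it can raise ModuleNotFoundError itself rather than return None; a later commit in this series ([PATCH 09/22] "Fix script") switches to attempting the import directly. A rough standalone sketch of that import-based check, not taken from the patch; the package list mirrors assert_local_install.py and the helper name is illustrative:

import importlib

OPTIONAL_PACKAGES = [
    "promptflow.azure",
    "azure.ai.ml",
    "azure.identity",
    "azure.storage.blob",
]


def assert_not_installed(package: str) -> None:
    """Fail if an Azure-only dependency is importable in a local-only environment."""
    try:
        importlib.import_module(package)
    except (ModuleNotFoundError, ImportError):
        return  # expected when running the local test matrix
    raise AssertionError(f"Package {package} must be uninstalled for local tests.")


if __name__ == "__main__":
    for pkg in OPTIONAL_PACKAGES:
        assert_not_installed(pkg)
    print("No optional Azure packages importable; local-only environment confirmed.")

Attempting the import and treating ModuleNotFoundError/ImportError as success keeps the check meaningful for namespace packages, where a spec lookup alone can behave inconsistently.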
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 17bfb5029cf..8eeae5f2bfb 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -13,6 +13,7 @@ @pytest.mark.usefixtures("recording_injection", "vcr_recording") @pytest.mark.e2etest class TestBuiltInEvaluators: + @pytest.mark.localtest def test_individual_evaluator_prompt_based(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -22,6 +23,7 @@ def test_individual_evaluator_prompt_based(self, model_config): assert score is not None assert score["gpt_fluency"] > 1.0 + @pytest.mark.localtest def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -55,6 +57,7 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred): assert "RAI service is not available in this region" in exc_info._excinfo[1].inner_exception.args[0] + @pytest.mark.localtest @pytest.mark.parametrize("parallel", [False, True]) def test_composite_evaluator_qa(self, model_config, parallel): qa_eval = QAEvaluator(model_config, parallel=parallel) @@ -94,6 +97,7 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred): assert score["hate_unfairness_score"] < 1.0 assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." + @pytest.mark.localtest @pytest.mark.parametrize( "eval_last_turn, parallel", [ From 5335b5449984131cbf5ee16c4c30932e3f42f93b Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:36:38 -0700 Subject: [PATCH 03/22] Typos --- .github/workflows/promptflow-evals-local-e2e-test.yml | 4 ++-- scripts/code_qa/assert_local_install.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 49d4da13738..1c3092ef8f2 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -1,4 +1,4 @@ -name: promptflow-evals-e2e-test +name: promptflow-evals_local-e2e-test on: schedule: @@ -55,7 +55,7 @@ jobs: run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} - name check azure is not installed - run: poetry run pytest scripts/assert_local_install.py + run: poetry run pytest scripts/code_qa/assert_local_install.py working-directory: ${{ github.workspace }} - name: run e2e tests run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py index 9d358171297..5dca70650b0 100644 --- a/scripts/code_qa/assert_local_install.py +++ b/scripts/code_qa/assert_local_install.py @@ -2,6 +2,7 @@ import importlib import pytest + class TestPackagesNotInstalles(): """Test imports.""" From 564e6724bee6d2825c3d2e34708fee7890d5934b Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:47:47 -0700 Subject: [PATCH 04/22] Fix test name --- .github/workflows/promptflow-evals-local-e2e-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 1c3092ef8f2..96fbf22ff19 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -6,7 +6,7 @@ on: pull_request: paths: - src/promptflow-evals/** - - .github/workflows/promptflow-evals-e2e-test.yml + - .github/workflows/promptflow-evals-local-e2e-test.yml workflow_dispatch: env: From 319f8c288b1ad18ea0014396f47f662447a087dd Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 13:02:45 -0700 Subject: [PATCH 05/22] Fix workflow file --- .github/workflows/promptflow-evals-local-e2e-test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 96fbf22ff19..f1e70016c03 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -1,4 +1,4 @@ -name: promptflow-evals_local-e2e-test +name: promptflow-evals-local-e2e-test on: schedule: @@ -15,7 +15,6 @@ env: jobs: test: - needs: build strategy: matrix: os: [ubuntu-latest, windows-latest, macos-13] From cc589d41c547083dacb6c5a550ff0e2878cc5481 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 14:24:24 -0700 Subject: [PATCH 06/22] Fix workflow file II --- .github/workflows/promptflow-evals-local-e2e-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index f1e70016c03..1bf37e83aee 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -53,7 +53,7 @@ jobs: - name: generate end-to-end test config from secret run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} - - name check azure is not installed + - name: check azure is not installed run: poetry run pytest scripts/code_qa/assert_local_install.py working-directory: ${{ github.workspace }} - name: run e2e tests From 8020ad42b632dd8639b7d13de03841361a5492cf Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 14:45:11 -0700 Subject: [PATCH 07/22] Run test in the project directory --- .github/workflows/promptflow-evals-local-e2e-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-local-e2e-test.yml index 1bf37e83aee..db9ca8d2716 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-local-e2e-test.yml @@ -54,8 +54,8 @@ jobs: run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json working-directory: ${{ env.WORKING_DIRECTORY }} - name: check azure is not installed - run: poetry run pytest scripts/code_qa/assert_local_install.py - working-directory: ${{ github.workspace }} + run: poetry run pytest ../../scripts/code_qa/assert_local_install.py + working-directory: ${{ env.WORKING_DIRECTORY }} - name: run e2e tests run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} From 
487073a32aea40bccc7dadbfaf2fd0c978a56edf Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:59:23 -0700 Subject: [PATCH 08/22] Fix and rename pipelines --- ...e-test.yml => promptflow-evals-e2e-test-azure.yml} | 6 +++--- ...e-test.yml => promptflow-evals-e2e-test-local.yml} | 6 +++--- .../promptflow/evals/evaluate/_eval_run.py | 11 ++++++++--- .../evals/evaluators/_groundedness/_groundedness.py | 5 +---- .../tests/evals/e2etests/test_adv_simulator.py | 2 +- .../tests/evals/e2etests/test_builtin_evaluators.py | 5 ++++- .../tests/evals/e2etests/test_evaluate.py | 5 ++++- .../tests/evals/e2etests/test_metrics_upload.py | 2 +- 8 files changed, 25 insertions(+), 17 deletions(-) rename .github/workflows/{promptflow-evals-e2e-test.yml => promptflow-evals-e2e-test-azure.yml} (94%) rename .github/workflows/{promptflow-evals-local-e2e-test.yml => promptflow-evals-e2e-test-local.yml} (95%) diff --git a/.github/workflows/promptflow-evals-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-azure.yml similarity index 94% rename from .github/workflows/promptflow-evals-e2e-test.yml rename to .github/workflows/promptflow-evals-e2e-test-azure.yml index 2275f9cbd6c..304bed4383d 100644 --- a/.github/workflows/promptflow-evals-e2e-test.yml +++ b/.github/workflows/promptflow-evals-e2e-test-azure.yml @@ -1,4 +1,4 @@ -name: promptflow-evals-e2e-test +name: promptflow-evals-e2e-test-azure on: schedule: @@ -6,7 +6,7 @@ on: pull_request: paths: - src/promptflow-evals/** - - .github/workflows/promptflow-evals-e2e-test.yml + - .github/workflows/promptflow-evals-e2e-test-azure.yml workflow_dispatch: env: @@ -83,7 +83,7 @@ jobs: creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }} enable-AzPSSession: true - name: run e2e tests - run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + run: poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report uses: actions/upload-artifact@v4 diff --git a/.github/workflows/promptflow-evals-local-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml similarity index 95% rename from .github/workflows/promptflow-evals-local-e2e-test.yml rename to .github/workflows/promptflow-evals-e2e-test-local.yml index db9ca8d2716..fc6e7254b6e 100644 --- a/.github/workflows/promptflow-evals-local-e2e-test.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ -1,4 +1,4 @@ -name: promptflow-evals-local-e2e-test +name: promptflow-evals-e2e-test-local on: schedule: @@ -6,7 +6,7 @@ on: pull_request: paths: - src/promptflow-evals/** - - .github/workflows/promptflow-evals-local-e2e-test.yml + - .github/workflows/promptflow-evals-e2e-test-local.yml workflow_dispatch: env: @@ -21,7 +21,7 @@ jobs: # TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158 # Add 3.9 back after we figure out the issue - python-version: ['3.8', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] fail-fast: false # snok/install-poetry need this to support Windows defaults: diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 7aabadac944..9c1f5abea5d 100644 --- 
a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -18,6 +18,8 @@ from promptflow.evals._version import VERSION from promptflow._sdk.entities import Run +LOGGER = logging.getLogger(__name__) + # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. try: @@ -28,9 +30,12 @@ # If the above mentioned modules cannot be imported, we are running # in local mode and MLClient in the constructor will be None, so # we will not arrive to Azure-dependent code. - pass - -LOGGER = logging.getLogger(__name__) + LOGGER.warning( + "azure-ai-ml cannot be imported. " + "The results will be saved locally, but will not be logged to Azure. " + "To log results to azure please install promptflow-evals with the command " + "pip install promptflow-evals[azure]" + ) @dataclasses.dataclass diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py index 282a2801a92..6eccd607814 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py @@ -75,10 +75,7 @@ def __call__(self, *, answer: str, context: str, **kwargs): raise ValueError("Both 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow - try: - llm_output = self._flow(answer=answer, context=context) - except BaseException as e: - print(e) + llm_output = self._flow(answer=answer, context=context) score = np.nan if llm_output: diff --git a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py index 1faef92a46a..16cd0bab1df 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py @@ -6,7 +6,7 @@ @pytest.mark.usefixtures("recording_injection") -@pytest.mark.e2etest +@pytest.mark.azuretest class TestAdvSimulator: @pytest.mark.usefixtures("vcr_recording") def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope): diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 8eeae5f2bfb..6e714a013b5 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -11,7 +11,6 @@ @pytest.mark.usefixtures("recording_injection", "vcr_recording") -@pytest.mark.e2etest class TestBuiltInEvaluators: @pytest.mark.localtest def test_individual_evaluator_prompt_based(self, model_config): @@ -33,6 +32,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): assert score is not None assert score["gpt_fluency"] > 0.0 + @pytest.mark.azuretest def test_individual_evaluator_service_based(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) score = eval_fn( @@ -44,6 +44,7 @@ def test_individual_evaluator_service_based(self, project_scope, azure_cred): assert score["violence_score"] < 1.0 assert score["violence_reason"], "violence_reason must not be None or empty." + @pytest.mark.azuretest @pytest.mark.skip(reason="Not working in ci pipeline. 
For local run.") def test_content_safety_service_unavailable(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) @@ -76,6 +77,7 @@ def test_composite_evaluator_qa(self, model_config, parallel): assert score["gpt_similarity"] > 0.0 assert score["f1_score"] > 0.0 + @pytest.mark.azuretest def test_composite_evaluator_content_safety(self, project_scope, azure_cred): safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred) score = safety_eval( @@ -160,6 +162,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel): assert score["evaluation_per_turn"]["gpt_retrieval"] is not None assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count + @pytest.mark.azuretest @pytest.mark.parametrize( "eval_last_turn, parallel", [ diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index ebd418bdfb9..88872f5a3b7 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -80,7 +80,6 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection") -@pytest.mark.e2etest class TestEvaluate: @pytest.mark.localtest def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): @@ -119,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): assert row_result_df["outputs.f1_score.f1_score"][2] == 1 assert result["studio_url"] is None + @pytest.mark.azuretest @pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.") def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred): input_data = pd.read_json(data_file, lines=True) @@ -306,6 +306,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config): assert "answer.length" in metrics.keys() assert "f1_score.f1_score" in metrics.keys() + @pytest.mark.azuretest def test_evaluate_track_in_cloud( self, questions_file, @@ -349,6 +350,7 @@ def test_evaluate_track_in_cloud( assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run" assert remote_run["runMetadata"]["displayName"] == evaluation_name + @pytest.mark.azuretest def test_evaluate_track_in_cloud_no_target( self, data_file, @@ -441,6 +443,7 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso if aggregate_return_json: assert "answer_length.median" in result["metrics"].keys() + @pytest.mark.localtest @pytest.mark.skip(reason="TODO: Add test back") def test_prompty_with_threadpool_implementation(self): pass diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py index 24ca1dd743c..e300348711b 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py @@ -48,7 +48,7 @@ def setup_data(azure_ml_client, project_scope): @pytest.mark.usefixtures("model_config", "recording_injection", "project_scope") -@pytest.mark.e2etest +@pytest.mark.azuretest class TestMetricsUpload(object): """End to end tests to check how the metrics were uploaded to cloud.""" From b3ee5fdad334ba5aa32adf7a6222a43d478f414f Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:26:15 -0700 Subject: [PATCH 09/22] Fix script --- 
scripts/code_qa/assert_local_install.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py index 5dca70650b0..3c9f56bd6d5 100644 --- a/scripts/code_qa/assert_local_install.py +++ b/scripts/code_qa/assert_local_install.py @@ -14,4 +14,8 @@ class TestPackagesNotInstalles(): ]) def test_promptflow_azure(self, package): """Test promptflow. azure is not installed.""" - assert importlib.util.find_spec(package) is None, f'Package {package} must be uninstalled for local test.' + try: + importlib.import_module(package) + assert False, f'Package {package} must be uninstalled for local test.' + except (ModuleNotFoundError, ImportError): + pass From 073ff027a922e7670298499d0e575b35a1fe82b2 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:44:44 -0700 Subject: [PATCH 10/22] Make jwt optional import in conftest --- src/promptflow-evals/tests/evals/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index dc1014c8082..2e303ffb4b7 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -4,7 +4,6 @@ from typing import Dict from unittest.mock import patch -import jwt import pytest from pytest_mock import MockerFixture @@ -40,6 +39,7 @@ def is_replay(): # Import of optional packages AZURE_INSTALLED = True try: + import jwt from azure.ai.ml._ml_client import MLClient except ImportError: AZURE_INSTALLED = False From ae94b1fe06557557820d3f7d756edb81fd13fe3f Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:02:03 -0700 Subject: [PATCH 11/22] Fix import for local test --- src/promptflow-evals/tests/evals/e2etests/test_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index 88872f5a3b7..66123c77406 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -6,7 +6,6 @@ import pandas as pd import pytest import requests -from azure.identity import DefaultAzureCredential from promptflow.evals.evaluate import evaluate from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator @@ -46,6 +45,7 @@ def question_evaluator(question): def _get_run_from_run_history(flow_run_id, ml_client, project_scope): """Get run info from run history""" + from azure.identity import DefaultAzureCredential token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token headers = { "Authorization": token, From be84305d7e16b378cead9b18a9b64b0c8e9be904 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:17:27 -0700 Subject: [PATCH 12/22] Fix imports --- .github/workflows/promptflow-evals-e2e-test-local.yml | 2 +- .../tests/evals/e2etests/test_metrics_upload.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml index fc6e7254b6e..f808956e451 100644 --- a/.github/workflows/promptflow-evals-e2e-test-local.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ 
-57,7 +57,7 @@ jobs: run: poetry run pytest ../../scripts/code_qa/assert_local_install.py working-directory: ${{ env.WORKING_DIRECTORY }} - name: run e2e tests - run: poetry run pytest -m localtest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + run: poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report uses: actions/upload-artifact@v4 diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py index e300348711b..3ed8c63a515 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py @@ -10,8 +10,12 @@ from promptflow.evals.evaluate._eval_run import EvalRun from promptflow.evals.evaluate._evaluate import evaluate from promptflow.evals.evaluators._f1_score._f1_score import F1ScoreEvaluator -from promptflow.recording.record_mode import is_live from promptflow.tracing import _start_trace +try: + from promptflow.recording.record_mode import is_live +except ModuleNotFoundError: + # The file is being imported by the local test + pass @pytest.fixture From e8c42e6d5abc8853fd3c6fb6257b1b70e9a0cf90 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:35:57 -0700 Subject: [PATCH 13/22] Do not try to get token if azure is not installed --- src/promptflow-evals/tests/evals/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 2e303ffb4b7..b0f22a35259 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -302,6 +302,8 @@ def azure_cred(): @pytest.fixture(scope=package_scope_in_live_mode()) def user_object_id() -> str: + if not AZURE_INSTALLED: + return "" if pytest.is_replay: from promptflow.recording.azure import SanitizedValues @@ -314,6 +316,8 @@ def user_object_id() -> str: @pytest.fixture(scope=package_scope_in_live_mode()) def tenant_id() -> str: + if not AZURE_INSTALLED: + return "" if pytest.is_replay: from promptflow.recording.azure import SanitizedValues From 870094427f66311e812978b812465bcbbd84ff9e Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:55:33 -0700 Subject: [PATCH 14/22] Fix recording --- src/promptflow-evals/tests/evals/conftest.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index b0f22a35259..f57a77ff837 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -330,9 +330,12 @@ def tenant_id() -> str: @pytest.fixture(scope=package_scope_in_live_mode()) def variable_recorder(): - from promptflow.recording.azure import VariableRecorder - - yield VariableRecorder() + if pytest.is_record or pytest.is_replay: + from promptflow.recording.azure import VariableRecorder + + yield VariableRecorder() + else: + yield None @pytest.fixture(scope=package_scope_in_live_mode()) From 9e9d29ad162e5e77ccf4a2b97a79d97c40116b95 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:21:01 -0700 Subject: 
[PATCH 15/22] Fix linter --- src/promptflow-evals/tests/evals/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index f57a77ff837..eec82c01508 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -332,7 +332,7 @@ def tenant_id() -> str: def variable_recorder(): if pytest.is_record or pytest.is_replay: from promptflow.recording.azure import VariableRecorder - + yield VariableRecorder() else: yield None From 150a081c7130ba879fb06617ce4cdc17ac581afd Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:00:15 -0700 Subject: [PATCH 16/22] Fixes --- .../promptflow-evals-e2e-test-local.yml | 1 - .../promptflow/evals/evaluate/_eval_run.py | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml index 30fb666d64e..4adaf86c5a9 100644 --- a/.github/workflows/promptflow-evals-e2e-test-local.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ -60,7 +60,6 @@ jobs: id: run_e2e_tests_local run: | poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml - poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }} working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 57aed2a5441..6c689c7a016 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -22,6 +22,7 @@ # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. +_AZURE_IMPORTED = True try: from azure.ai.ml.entities._credentials import AccountKeyConfiguration from azure.ai.ml.entities._datastore.datastore import Datastore @@ -30,12 +31,11 @@ # If the above mentioned modules cannot be imported, we are running # in local mode and MLClient in the constructor will be None, so # we will not arrive to Azure-dependent code. - LOGGER.warning( - "azure-ai-ml cannot be imported. " - "The results will be saved locally, but will not be logged to Azure. " - "To log results to azure please install promptflow-evals with the command " - "pip install promptflow-evals[azure]" - ) + + # We are not logging the import failure because + # - If the project configuration was not provided this import is not needed. + # - If the project configuration was provided, the error will be raised by PFClient. 
+ pass @dataclasses.dataclass @@ -133,8 +133,11 @@ def __init__(self, self._is_promptflow_run: bool = promptflow_run is not None self._is_broken = False if self._tracking_uri is None: - LOGGER.warning("tracking_uri was not provided, " - "The results will be saved locally, but will not be logged to Azure.") + if self._tracking_uri: + LOGGER.warning( + "tracking_uri was not provided, " + "The results will be saved locally, but will not be logged to Azure." + ) self._url_base = None self._is_broken = True self.info = RunInfo.generate(run_name) From 3dc5c9bbbb1d1b89fe343a3450c66ad968b98bd3 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:01:28 -0700 Subject: [PATCH 17/22] Fix indentation --- .github/workflows/promptflow-evals-e2e-test-local.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml index 4adaf86c5a9..f5cef2aa4d2 100644 --- a/.github/workflows/promptflow-evals-e2e-test-local.yml +++ b/.github/workflows/promptflow-evals-e2e-test-local.yml @@ -59,7 +59,7 @@ jobs: - name: run e2e tests id: run_e2e_tests_local run: | - poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml + poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }} working-directory: ${{ env.WORKING_DIRECTORY }} - name: upload coverage report From 96fd9fbef034f30e593f5a3d0617e1169f15b24d Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:18:59 -0700 Subject: [PATCH 18/22] Remove extra code --- .../promptflow/evals/evaluate/_eval_run.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 6c689c7a016..75539ed7381 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -22,7 +22,6 @@ # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. -_AZURE_IMPORTED = True try: from azure.ai.ml.entities._credentials import AccountKeyConfiguration from azure.ai.ml.entities._datastore.datastore import Datastore @@ -133,11 +132,10 @@ def __init__(self, self._is_promptflow_run: bool = promptflow_run is not None self._is_broken = False if self._tracking_uri is None: - if self._tracking_uri: - LOGGER.warning( - "tracking_uri was not provided, " - "The results will be saved locally, but will not be logged to Azure." - ) + LOGGER.warning( + "tracking_uri was not provided, " + "The results will be saved locally, but will not be logged to Azure." 
+ ) self._url_base = None self._is_broken = True self.info = RunInfo.generate(run_name) From abca0e9d887b223abe8c0e121ab7f5794e255fb4 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:12:01 -0700 Subject: [PATCH 19/22] Add logging back and add .settings to gitignore --- .gitignore | 1 + src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9ef59b176df..5489d5b6745 100644 --- a/.gitignore +++ b/.gitignore @@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py # Eclipse project files **/.project **/.pydevproject +**/.settings # benchmark results benchmark/promptflow-serve/test_runner/locust-results/ \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 75539ed7381..86deb73fbc2 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -31,10 +31,10 @@ # in local mode and MLClient in the constructor will be None, so # we will not arrive to Azure-dependent code. - # We are not logging the import failure because + # We are logging the import failure only if debug logging level is set because: # - If the project configuration was not provided this import is not needed. # - If the project configuration was provided, the error will be raised by PFClient. - pass + LOGGER.debug("promptflow.azure is not installed.") @dataclasses.dataclass From b94bb2bf908c4f8becf602e62b496144edc7b85b Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:37:16 -0700 Subject: [PATCH 20/22] Fix --- .../tests/evals/e2etests/test_builtin_evaluators.py | 7 +------ .../tests/evals/e2etests/test_evaluate.py | 9 +-------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 6e714a013b5..77306cdb576 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -11,8 +11,8 @@ @pytest.mark.usefixtures("recording_injection", "vcr_recording") +@pytest.mark.localtest class TestBuiltInEvaluators: - @pytest.mark.localtest def test_individual_evaluator_prompt_based(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -22,7 +22,6 @@ def test_individual_evaluator_prompt_based(self, model_config): assert score is not None assert score["gpt_fluency"] > 1.0 - @pytest.mark.localtest def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): eval_fn = FluencyEvaluator(model_config) score = eval_fn( @@ -32,7 +31,6 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): assert score is not None assert score["gpt_fluency"] > 0.0 - @pytest.mark.azuretest def test_individual_evaluator_service_based(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) score = eval_fn( @@ -44,7 +42,6 @@ def test_individual_evaluator_service_based(self, project_scope, azure_cred): assert score["violence_score"] < 1.0 assert score["violence_reason"], "violence_reason must not be None or empty." - @pytest.mark.azuretest @pytest.mark.skip(reason="Not working in ci pipeline. 
For local run.") def test_content_safety_service_unavailable(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) @@ -58,7 +55,6 @@ def test_content_safety_service_unavailable(self, project_scope, azure_cred): assert "RAI service is not available in this region" in exc_info._excinfo[1].inner_exception.args[0] - @pytest.mark.localtest @pytest.mark.parametrize("parallel", [False, True]) def test_composite_evaluator_qa(self, model_config, parallel): qa_eval = QAEvaluator(model_config, parallel=parallel) @@ -99,7 +95,6 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred): assert score["hate_unfairness_score"] < 1.0 assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." - @pytest.mark.localtest @pytest.mark.parametrize( "eval_last_turn, parallel", [ diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index 66123c77406..f57c05e35ce 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -80,8 +80,8 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection") +@pytest.mark.localtest class TestEvaluate: - @pytest.mark.localtest def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): # data input_data = pd.read_json(data_file, lines=True) @@ -154,7 +154,6 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 - @pytest.mark.localtest @pytest.mark.parametrize( "use_pf_client,function,column", [ @@ -188,7 +187,6 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu assert metrics.get(metric) == np.nanmean(row_result_df[out_column]) assert row_result_df[out_column][2] == 31 - @pytest.mark.localtest def test_evaluate_with_target(self, questions_file): """Test evaluation with target function.""" # We cannot define target in this file as pytest will load @@ -212,7 +210,6 @@ def test_evaluate_with_target(self, questions_file): assert "outputs.f1.f1_score" in row_result_df.columns assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"]) - @pytest.mark.localtest @pytest.mark.parametrize( "evaluation_config", [ @@ -255,7 +252,6 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config): expected = list(row_result_df[question].str.len()) assert expected == list(row_result_df["outputs.question_ev.length"]) - @pytest.mark.localtest @pytest.mark.parametrize( "evaluate_config", [ @@ -393,7 +389,6 @@ def test_evaluate_track_in_cloud_no_target( assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent" assert remote_run["runMetadata"]["displayName"] == evaluation_name - @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ @@ -418,7 +413,6 @@ def test_evaluate_aggregation_with_threadpool(self, data_file, return_json, aggr if aggregate_return_json: assert "answer_length.median" in result["metrics"].keys() - @pytest.mark.localtest @pytest.mark.parametrize( "return_json, aggregate_return_json", [ @@ -443,7 +437,6 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso if aggregate_return_json: assert "answer_length.median" in 
result["metrics"].keys() - @pytest.mark.localtest @pytest.mark.skip(reason="TODO: Add test back") def test_prompty_with_threadpool_implementation(self): pass From f9bf2173de00c8d701edd21563fa8baec8da33a3 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:52:31 -0700 Subject: [PATCH 21/22] Add conftest change --- src/promptflow-evals/tests/evals/conftest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index eec82c01508..396673637a3 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -362,3 +362,9 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id yield recording else: yield None + + +def pytest_collection_modifyitems(items): + for item in items: + if item.get_closest_marker('azuretest'): + item.own_markers = [marker for marker in item.own_markers if marker.name != 'localtest'] From 851385c370d006f2e03146b85c6e95ee574d3b6e Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:36:35 -0700 Subject: [PATCH 22/22] Fix conftest --- src/promptflow-evals/tests/evals/conftest.py | 13 +++++++++++-- .../tests/evals/e2etests/test_builtin_evaluators.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 396673637a3..e184b334628 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -365,6 +365,15 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id def pytest_collection_modifyitems(items): + parents = {} for item in items: - if item.get_closest_marker('azuretest'): - item.own_markers = [marker for marker in item.own_markers if marker.name != 'localtest'] + # Check if parent contains 'localtest' marker and remove it. + if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents: + if id(item.parent) not in parents: + item.parent.own_markers = [ + marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest'] + parents[id(item.parent)] = item.parent + if not item.get_closest_marker('azuretest'): + # If item's parent was marked as 'localtest', mark the child as such, but not if + # it was marked as 'azuretest'. + item.add_marker(pytest.mark.localtest) diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index 77306cdb576..e1a305ca388 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config): assert score is not None assert score["gpt_fluency"] > 0.0 + @pytest.mark.azuretest def test_individual_evaluator_service_based(self, project_scope, azure_cred): eval_fn = ViolenceEvaluator(project_scope, azure_cred) score = eval_fn(