Add local e2e test gate #3534

Merged · 26 commits · Jul 15, 2024
.github/workflows/promptflow-evals-e2e-test.yml → .github/workflows/promptflow-evals-e2e-test-azure.yml
@@ -1,12 +1,12 @@
name: promptflow-evals-e2e-test
name: promptflow-evals-e2e-test-azure

on:
schedule:
- cron: "40 10 * * *" # 2:40 PST every day
pull_request:
paths:
- src/promptflow-evals/**
- .github/workflows/promptflow-evals-e2e-test.yml
- .github/workflows/promptflow-evals-e2e-test-azure.yml
workflow_dispatch:

env:
@@ -83,10 +83,10 @@ jobs:
creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
enable-AzPSSession: true
- name: run e2e tests
id: run_all_e2e_tests
id: run_e2e_tests_azure
run: |
poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity all_e2e_tests_run_times --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_azure --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: upload coverage report
uses: actions/upload-artifact@v4
98 changes: 98 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test-local.yml
@@ -0,0 +1,98 @@
name: promptflow-evals-e2e-test-local

on:
schedule:
- cron: "40 10 * * *" # 2:40 PST every day
pull_request:
paths:
- src/promptflow-evals/**
- .github/workflows/promptflow-evals-e2e-test-local.yml
workflow_dispatch:

env:
IS_IN_CI_PIPELINE: "true"
WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals

jobs:
test:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-13]
# TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package
# https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
# Add 3.9 back after we figure out the issue
python-version: ['3.8', '3.9', '3.10', '3.11']
fail-fast: false
# snok/install-poetry need this to support Windows
defaults:
run:
shell: bash
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: set test mode
# Always run in replay mode for now until we figure out the test resource to run live mode
run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
#run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: snok/install-poetry@v1
- name: install test dependency group
run: poetry install --only test
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow packages in editable mode
run: |
poetry run pip install -e ../promptflow
poetry run pip install -e ../promptflow-core
poetry run pip install -e ../promptflow-devkit
poetry run pip install -e ../promptflow-tracing
poetry run pip install -e ../promptflow-tools
poetry run pip install -e ../promptflow-evals
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: generate end-to-end test config from secret
run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: check azure is not installed
run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: run e2e tests
id: run_e2e_tests_local
run: |
poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: upload coverage report
uses: actions/upload-artifact@v4
with:
name: report-${{ matrix.os }}-py${{ matrix.python-version }}
path: |
${{ env.WORKING_DIRECTORY }}/*.xml
${{ env.WORKING_DIRECTORY }}/htmlcov/

report:
needs: test
runs-on: ubuntu-latest
permissions:
checks: write
pull-requests: write
contents: read
issues: read
steps:
- uses: actions/download-artifact@v4
with:
path: artifacts
- uses: EnricoMi/publish-unit-test-result-action@v2
with:
check_name: promptflow-evals test result
comment_title: promptflow-evals test result
files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml
- uses: irongut/CodeCoverageSummary@v1.3.0
with:
filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
badge: true
fail_below_min: false
format: markdown
hide_complexity: true
output: both
thresholds: 40 80
1 change: 1 addition & 0 deletions .gitignore
@@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py
# Eclipse project files
**/.project
**/.pydevproject
**/.settings

# benchmark results
benchmark/promptflow-serve/test_runner/locust-results/
21 changes: 21 additions & 0 deletions scripts/code_qa/assert_local_install.py
@@ -0,0 +1,21 @@
"""Tests checking that azure packages are NOT installed."""
import importlib
import pytest


class TestPackagesNotInstalled:
"""Test imports."""

@pytest.mark.parametrize('package', [
'promptflow.azure',
'azure.ai.ml',
'azure.identity',
'azure.storage.blob'
])
def test_promptflow_azure(self, package):
"""Test promptflow. azure is not installed."""
try:
importlib.import_module(package)
assert False, f'Package {package} must be uninstalled for local test.'
except (ModuleNotFoundError, ImportError):
pass
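The new assert_local_install.py gate fails whenever any azure package can be imported in the local job. For reference, a hedged alternative sketch (not part of the PR) that checks availability via importlib.util.find_spec instead of importing the package outright:

import importlib.util

import pytest


def _is_importable(name: str) -> bool:
    """Return True if the module can be found, without fully importing it."""
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        # A missing parent package also counts as "not importable".
        return False


@pytest.mark.parametrize("package", [
    "promptflow.azure",
    "azure.ai.ml",
    "azure.identity",
    "azure.storage.blob",
])
def test_azure_package_not_installed(package):
    assert not _is_importable(package), (
        f"Package {package} must be uninstalled for local tests.")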
17 changes: 14 additions & 3 deletions src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
@@ -21,10 +21,21 @@

LOGGER = logging.getLogger(__name__)


# Handle optional import. The azure libraries are only present if
# promptflow-azure is installed.
try:
from azure.storage.blob import BlobServiceClient
from azure.ai.ml.entities._credentials import AccountKeyConfiguration
except ImportError:
from azure.ai.ml.entities._datastore.datastore import Datastore
from azure.storage.blob import BlobServiceClient
except (ModuleNotFoundError, ImportError):
# If the above mentioned modules cannot be imported, we are running
# in local mode and MLClient in the constructor will be None, so
# we will not arrive to Azure-dependent code.

# We are logging the import failure only if debug logging level is set because:
# - If the project configuration was not provided this import is not needed.
# - If the project configuration was provided, the error will be raised by PFClient.
LOGGER.debug("promptflow.azure is not installed.")


Expand Down Expand Up @@ -413,7 +424,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
if response.status_code != 200:
self._log_warning('register artifact', response)

def _get_datastore_credential(self, datastore: 'Datastore'):
def _get_datastore_credential(self, datastore: "Datastore"):
# Reference the logic in azure.ai.ml._artifact._artifact_utilities
# https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
credential = datastore.credentials
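The comment in the hunk above explains the fallback: when the azure extras are missing, the run operates in local mode, the ML client in the constructor stays None, and Azure-dependent code is never reached. A minimal sketch of that guard pattern, using hypothetical names (RunUploader and upload_artifacts are illustrative, not the PR's actual class or method):

import logging

LOGGER = logging.getLogger(__name__)

try:
    # Optional dependency: only present when the azure extras are installed.
    from azure.storage.blob import BlobServiceClient  # noqa: F401
    AZURE_INSTALLED = True
except (ModuleNotFoundError, ImportError):
    AZURE_INSTALLED = False
    LOGGER.debug("promptflow.azure is not installed.")


class RunUploader:
    """Hypothetical helper illustrating the local-mode short circuit."""

    def __init__(self, ml_client=None):
        # In local mode no MLClient is constructed, so this stays None.
        self._ml_client = ml_client

    def upload_artifacts(self, artifact_folder: str) -> None:
        if self._ml_client is None or not AZURE_INSTALLED:
            LOGGER.debug("Local mode: skipping artifact upload to Azure.")
            return
        # Azure-dependent upload logic would only execute past this point.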
53 changes: 42 additions & 11 deletions src/promptflow-evals/tests/evals/conftest.py
@@ -4,9 +4,8 @@
from typing import Dict
from unittest.mock import patch

import jwt
import pytest
from azure.ai.ml._ml_client import MLClient

from pytest_mock import MockerFixture

from promptflow.client import PFClient
@@ -20,8 +19,8 @@
from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
except ImportError as e:
print(f"Failed to import promptflow-recording: {e}")

# Run test in empty mode if promptflow-recording is not installed

def recording_array_reset():
pass

@@ -37,6 +36,13 @@ def is_record():
def is_replay():
return False

# Import of optional packages
AZURE_INSTALLED = True
try:
import jwt
from azure.ai.ml._ml_client import MLClient
except ImportError:
AZURE_INSTALLED = False

PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
@pytest.fixture
def azure_ml_client(project_scope: Dict):
"""The fixture, returning MLClient"""
return MLClient(
subscription_id=project_scope["subscription_id"],
resource_group_name=project_scope["resource_group_name"],
workspace_name=project_scope["project_name"],
credential=get_cred(),
)
if AZURE_INSTALLED:
return MLClient(
subscription_id=project_scope["subscription_id"],
resource_group_name=project_scope["resource_group_name"],
workspace_name=project_scope["project_name"],
credential=get_cred(),
)
else:
return None


@pytest.fixture
Expand Down Expand Up @@ -293,6 +302,8 @@ def azure_cred():

@pytest.fixture(scope=package_scope_in_live_mode())
def user_object_id() -> str:
if not AZURE_INSTALLED:
return ""
if pytest.is_replay:
from promptflow.recording.azure import SanitizedValues

@@ -305,6 +316,8 @@ def user_object_id() -> str:

@pytest.fixture(scope=package_scope_in_live_mode())
def tenant_id() -> str:
if not AZURE_INSTALLED:
return ""
if pytest.is_replay:
from promptflow.recording.azure import SanitizedValues

@@ -317,9 +330,12 @@ def tenant_id() -> str:

@pytest.fixture(scope=package_scope_in_live_mode())
def variable_recorder():
from promptflow.recording.azure import VariableRecorder
if pytest.is_record or pytest.is_replay:
from promptflow.recording.azure import VariableRecorder

yield VariableRecorder()
yield VariableRecorder()
else:
yield None


@pytest.fixture(scope=package_scope_in_live_mode())
Expand All @@ -346,3 +362,18 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id
yield recording
else:
yield None


def pytest_collection_modifyitems(items):
parents = {}
for item in items:
# Check if parent contains 'localtest' marker and remove it.
if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents:
if id(item.parent) not in parents:
item.parent.own_markers = [
marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest']
parents[id(item.parent)] = item.parent
if not item.get_closest_marker('azuretest'):
# If item's parent was marked as 'localtest', mark the child as such, but not if
# it was marked as 'azuretest'.
item.add_marker(pytest.mark.localtest)
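The hook above auto-applies the localtest marker to collected items; the localtest and azuretest markers themselves also need to be registered with pytest to avoid unknown-marker warnings. That registration is not visible in this diff (it may live in pyproject.toml); a hedged, conftest-based sketch of one way to do it:

def pytest_configure(config):
    # Assumed registration; the repository may declare these markers in
    # pyproject.toml instead. Not part of the PR diff shown above.
    config.addinivalue_line(
        "markers", "localtest: e2e test that runs without any Azure dependency")
    config.addinivalue_line(
        "markers", "azuretest: e2e test that requires Azure resources and credentials")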
@@ -6,7 +6,7 @@


@pytest.mark.usefixtures("recording_injection")
@pytest.mark.e2etest
@pytest.mark.azuretest
class TestAdvSimulator:
@pytest.mark.usefixtures("vcr_recording")
def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope):
@@ -11,7 +11,7 @@


@pytest.mark.usefixtures("recording_injection", "vcr_recording")
@pytest.mark.e2etest
@pytest.mark.localtest
class TestBuiltInEvaluators:
def test_individual_evaluator_prompt_based(self, model_config):
eval_fn = FluencyEvaluator(model_config)
@@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
assert score is not None
assert score["gpt_fluency"] > 0.0

@pytest.mark.azuretest
def test_individual_evaluator_service_based(self, project_scope, azure_cred):
eval_fn = ViolenceEvaluator(project_scope, azure_cred)
score = eval_fn(
@@ -73,6 +74,7 @@ def test_composite_evaluator_qa(self, model_config, parallel):
assert score["gpt_similarity"] > 0.0
assert score["f1_score"] > 0.0

@pytest.mark.azuretest
def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
score = safety_eval(
@@ -156,6 +158,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel):
assert score["evaluation_per_turn"]["gpt_retrieval"] is not None
assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count

@pytest.mark.azuretest
@pytest.mark.parametrize(
"eval_last_turn, parallel",
[
7 changes: 5 additions & 2 deletions src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -6,7 +6,7 @@
import pandas as pd
import pytest
import requests
from azure.identity import DefaultAzureCredential

from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator
@@ -46,6 +45,7 @@ def question_evaluator(question):

def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
"""Get run info from run history"""
from azure.identity import DefaultAzureCredential
token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token
headers = {
"Authorization": token,
@@ -80,7 +80,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope):


@pytest.mark.usefixtures("recording_injection")
@pytest.mark.e2etest
@pytest.mark.localtest
class TestEvaluate:
def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
# data
@@ -118,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
assert row_result_df["outputs.f1_score.f1_score"][2] == 1
assert result["studio_url"] is None

@pytest.mark.azuretest
@pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.")
def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
input_data = pd.read_json(data_file, lines=True)
@@ -301,6 +302,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
assert "answer.length" in metrics.keys()
assert "f1_score.f1_score" in metrics.keys()

@pytest.mark.azuretest
def test_evaluate_track_in_cloud(
self,
questions_file,
@@ -344,6 +346,7 @@ def test_evaluate_track_in_cloud(
assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
assert remote_run["runMetadata"]["displayName"] == evaluation_name

@pytest.mark.azuretest
def test_evaluate_track_in_cloud_no_target(
self,
data_file,
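Putting the gate together: a hedged sketch (the test class and test names here are hypothetical, not in the PR) of how a new e2e test module would opt into each gate, mirroring the marker pattern applied in the test files above:

import pytest


@pytest.mark.usefixtures("recording_injection", "vcr_recording")
@pytest.mark.localtest
class TestMyEvaluator:
    def test_prompt_based(self, model_config):
        ...  # collected by the local gate (pytest -m localtest)

    @pytest.mark.azuretest
    def test_service_based(self, project_scope, azure_cred):
        ...  # collected only by the Azure gate (pytest -m azuretest)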