From 8df5185d668292d5adc11ebf9477e2fdd44599d4 Mon Sep 17 00:00:00 2001 From: Sara Robinson Date: Thu, 17 Aug 2023 21:53:13 -0700 Subject: [PATCH] feat: LLM - release model evaluation for TextGenerationModel to public preview PiperOrigin-RevId: 558027302 --- tests/unit/aiplatform/test_language_models.py | 836 +++++++++++++++++- .../_evaluatable_language_models.py | 755 ++++++++++++++++ vertexai/language_models/_language_models.py | 22 +- vertexai/preview/language_models.py | 16 + 4 files changed, 1622 insertions(+), 7 deletions(-) create mode 100644 vertexai/language_models/_evaluatable_language_models.py diff --git a/tests/unit/aiplatform/test_language_models.py b/tests/unit/aiplatform/test_language_models.py index 5dd1c74dad..76d6c144d6 100644 --- a/tests/unit/aiplatform/test_language_models.py +++ b/tests/unit/aiplatform/test_language_models.py @@ -41,6 +41,7 @@ ) from google.cloud.aiplatform.compat.services import prediction_service_client from google.cloud.aiplatform.compat.types import ( + artifact as gca_artifact, prediction_service as gca_prediction_service, context as gca_context, endpoint as gca_endpoint, @@ -58,6 +59,9 @@ ) from vertexai import language_models from vertexai.language_models import _language_models +from vertexai.language_models import ( + _evaluatable_language_models, +) from google.cloud.aiplatform_v1 import Execution as GapicExecution from google.cloud.aiplatform.compat.types import ( encryption_spec as gca_encryption_spec, @@ -326,6 +330,251 @@ def reverse_string_2(s):""", } ) +_TEST_TEXT_GENERATION_METRICS = { + "bleu": 3.9311041439597427, + "rougeLSum": 19.014677479620463, +} + + +_TEST_TEXT_CLASSIFICATION_METRICS = {"auPrc": 0.9, "auRoc": 0.8, "logLoss": 0.5} + +_TEST_EVAL_DATA = [ + { + "prompt": "Basketball teams in the Midwest.", + "ground_truth": "There are several basketball teams located in the Midwest region of the United States. 
Here are some of them:", + }, + { + "prompt": "How to bake gluten-free bread?", + "ground_truth": "Baking gluten-free bread can be a bit challenging because gluten is the protein that gives bread its structure and elasticity.", + }, +] + +_TEST_EVAL_DATA_DF = pd.DataFrame(_TEST_EVAL_DATA) + +_TEST_ARTIFACT_ID = "123456" +_TEST_ARTIFACT_NAME = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}/metadataStores/default/artifacts/{_TEST_ARTIFACT_ID}" + +_TEST_EVAL_PIPELINE_SPEC = { + "components": {}, + "pipelineInfo": {"name": "evaluation-llm-text-generation-pipeline"}, + "root": { + "dag": {"tasks": {}}, + "inputDefinitions": { + "parameters": { + "batch_predict_accelerator_count": { + "defaultValue": 0.0, + "isOptional": True, + "parameterType": "NUMBER_INTEGER", + }, + "batch_predict_accelerator_type": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_gcs_source_uris": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "batch_predict_gcs_destination_output_uri": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "enable_web_access": { + "defaultValue": True, + "isOptional": True, + "parameterType": "BOOLEAN", + }, + "encryption_spec_key_name": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "evaluation_display_name": { + "defaultValue": "evaluation-text-generation", + "isOptional": True, + "parameterType": "STRING", + }, + "location": { + "defaultValue": "us-central1", + "isOptional": True, + "parameterType": "STRING", + }, + "machine_type": { + "defaultValue": "e2-highmem-16", + "isOptional": True, + "parameterType": "STRING", + }, + "model_name": {"parameterType": "STRING"}, + "evaluation_task": {"parameterType": "STRING"}, + "network": { + "defaultValue": "", + "isOptional": True, + "parameterType": 
"STRING", + }, + "nlp_task": { + "defaultValue": "text-generation", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_gcs_source": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "project": {"parameterType": "STRING"}, + "root_dir": {"parameterType": "STRING"}, + "service_account": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + } + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.0-beta.14", +} + + +_TEST_EVAL_PIPELINE_SPEC_JSON = json.dumps( + _TEST_EVAL_PIPELINE_SPEC, +) + +_TEST_EVAL_PIPELINE_JOB = json.dumps( + { + "runtimeConfig": {"parameterValues": {}}, + "pipelineSpec": json.loads(_TEST_EVAL_PIPELINE_SPEC_JSON), + } +) + +# Eval classification spec + +_TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC = { + "components": {}, + "pipelineInfo": {"name": "evaluation-llm-text-generation-pipeline"}, + "root": { + "dag": {"tasks": {}}, + "inputDefinitions": { + "parameters": { + "batch_predict_accelerator_count": { + "defaultValue": 0.0, + "isOptional": True, + "parameterType": "NUMBER_INTEGER", + }, + "batch_predict_accelerator_type": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_gcs_source_uris": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "batch_predict_gcs_destination_output_uri": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "enable_web_access": { + "defaultValue": True, + "isOptional": True, + "parameterType": "BOOLEAN", + }, + "encryption_spec_key_name": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "evaluation_display_name": { + "defaultValue": "evaluation-text-generation", + 
"isOptional": True, + "parameterType": "STRING", + }, + "location": { + "defaultValue": "us-central1", + "isOptional": True, + "parameterType": "STRING", + }, + "machine_type": { + "defaultValue": "e2-highmem-16", + "isOptional": True, + "parameterType": "STRING", + }, + "model_name": {"parameterType": "STRING"}, + "evaluation_task": {"parameterType": "STRING"}, + "network": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "nlp_task": { + "defaultValue": "text-generation", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_gcs_source": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "evaluation_class_labels": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "target_field_name": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "project": {"parameterType": "STRING"}, + "root_dir": {"parameterType": "STRING"}, + "service_account": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + } + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.0-beta.14", +} + +_TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC_JSON = json.dumps( + _TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC, +) + +_TEST_EVAL_CLASSIFICATION_PIPELINE_JOB = json.dumps( + { + "runtimeConfig": {"parameterValues": {}}, + "pipelineSpec": json.loads(_TEST_EVAL_PIPELINE_SPEC_JSON), + } +) + @pytest.fixture def mock_pipeline_bucket_exists(): @@ -382,6 +631,96 @@ def make_pipeline_job(state): ) +def make_eval_pipeline_job(state): + return gca_pipeline_job.PipelineJob( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + state=state, + create_time=test_constants.PipelineJobConstants._TEST_PIPELINE_CREATE_TIME, + service_account=test_constants.ProjectConstants._TEST_SERVICE_ACCOUNT, + 
network=test_constants.TrainingJobConstants._TEST_NETWORK, + job_detail=gca_pipeline_job.PipelineJobDetail( + pipeline_run_context=gca_context.Context( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + ), + task_details=[ + gca_pipeline_job.PipelineTaskDetail( + task_id=456, + task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + name="test-metric-artifact", + metadata=_TEST_TEXT_GENERATION_METRICS, + ), + ], + ) + }, + ), + gca_pipeline_job.PipelineTaskDetail( + task_id=789, + task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + display_name="evaluation_metrics", + uri="gs://test-bucket/evaluation_metrics", + ), + ] + ) + }, + ), + ], + ), + ) + + +def make_eval_classification_pipeline_job(state): + return gca_pipeline_job.PipelineJob( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + state=state, + create_time=test_constants.PipelineJobConstants._TEST_PIPELINE_CREATE_TIME, + service_account=test_constants.ProjectConstants._TEST_SERVICE_ACCOUNT, + network=test_constants.TrainingJobConstants._TEST_NETWORK, + job_detail=gca_pipeline_job.PipelineJobDetail( + pipeline_run_context=gca_context.Context( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + ), + task_details=[ + gca_pipeline_job.PipelineTaskDetail( + task_id=456, + task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + name="test-metric-artifact", + metadata=_TEST_TEXT_CLASSIFICATION_METRICS, + ), + ], + ) + }, + ), + gca_pipeline_job.PipelineTaskDetail( + task_id=789, + 
task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + display_name="evaluation_metrics", + uri="gs://test-bucket/evaluation_metrics", + ), + ] + ) + }, + ), + ], + ), + ) + + @pytest.fixture def mock_pipeline_service_create(): with mock.patch.object( @@ -393,6 +732,28 @@ def mock_pipeline_service_create(): yield mock_create_pipeline_job +@pytest.fixture +def mock_pipeline_service_create_eval(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "create_pipeline_job" + ) as mock_create_pipeline_job: + mock_create_pipeline_job.return_value = make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_create_pipeline_job + + +@pytest.fixture +def mock_pipeline_service_create_eval_classification(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "create_pipeline_job" + ) as mock_create_pipeline_job: + mock_create_pipeline_job.return_value = make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_create_pipeline_job + + @pytest.fixture def mock_pipeline_job_get(): with mock.patch.object( @@ -429,6 +790,82 @@ def mock_pipeline_job_get(): yield mock_get_pipeline_job +@pytest.fixture +def mock_pipeline_job_get_eval(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_pipeline_job: + mock_get_pipeline_job.side_effect = [ + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + 
gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + ] + + yield mock_get_pipeline_job + + +@pytest.fixture +def mock_pipeline_job_get_eval_classification(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_pipeline_job: + mock_get_pipeline_job.side_effect = [ + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + ] + + yield mock_get_pipeline_job + + @pytest.fixture def mock_load_yaml_and_json(job_spec): with mock.patch.object( @@ -451,8 +888,34 @@ def mock_gcs_upload(): @pytest.fixture -def mock_request_urlopen(request: str) -> Tuple[str, mock.MagicMock]: - data = _TEST_PIPELINE_SPEC +def mock_request_urlopen(request: str) -> Tuple[str, mock.MagicMock]: 
+ data = _TEST_PIPELINE_SPEC + with mock.patch.object(urllib_request, "urlopen") as mock_urlopen: + mock_read_response = mock.MagicMock() + mock_decode_response = mock.MagicMock() + mock_decode_response.return_value = json.dumps(data) + mock_read_response.return_value.decode = mock_decode_response + mock_urlopen.return_value.read = mock_read_response + yield request.param, mock_urlopen + + +@pytest.fixture +def mock_request_urlopen_eval(request: str) -> Tuple[str, mock.MagicMock]: + data = _TEST_EVAL_PIPELINE_SPEC + with mock.patch.object(urllib_request, "urlopen") as mock_urlopen: + mock_read_response = mock.MagicMock() + mock_decode_response = mock.MagicMock() + mock_decode_response.return_value = json.dumps(data) + mock_read_response.return_value.decode = mock_decode_response + mock_urlopen.return_value.read = mock_read_response + yield request.param, mock_urlopen + + +@pytest.fixture +def mock_request_urlopen_eval_classification( + request: str, +) -> Tuple[str, mock.MagicMock]: + data = _TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC with mock.patch.object(urllib_request, "urlopen") as mock_urlopen: mock_read_response = mock.MagicMock() mock_decode_response = mock.MagicMock() @@ -528,6 +991,48 @@ def get_endpoint_with_models_mock(): yield get_endpoint_models_mock +# Model Evaluation fixtures +@pytest.fixture +def mock_model_evaluate(): + with mock.patch.object( + _evaluatable_language_models._EvaluatableLanguageModel, "evaluate" + ) as mock_model_evaluate: + mock_model_evaluate.return_value = _TEST_TEXT_GENERATION_METRICS + yield mock_model_evaluate + + +@pytest.fixture +def mock_successfully_completed_eval_job(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_model_eval_job: + mock_get_model_eval_job.return_value = make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_get_model_eval_job + + +@pytest.fixture +def 
mock_successfully_completed_eval_classification_job(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_model_eval_job: + mock_get_model_eval_job.return_value = make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_get_model_eval_job + + +@pytest.fixture +def mock_storage_blob_upload_from_filename(): + with mock.patch( + "google.cloud.storage.Blob.upload_from_filename" + ) as mock_blob_upload_from_filename, mock.patch( + "google.cloud.storage.Bucket.exists", return_value=True + ): + yield mock_blob_upload_from_filename + + @pytest.mark.usefixtures("google_auth_mock") class TestLanguageModels: """Unit tests for the language models.""" @@ -1530,3 +2035,330 @@ def test_batch_prediction_for_text_embedding(self): gcs_destination_prefix="gs://test-bucket/results/", model_parameters={}, ) + + +# TODO (b/285946649): add more test coverage before public preview release +@pytest.mark.usefixtures("google_auth_mock") +class TestLanguageModelEvaluation: + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + @pytest.mark.parametrize( + "mock_request_urlopen_eval", + ["https://us-kfp.pkg.dev/proj/repo/pack/latest"], + indirect=True, + ) + def test_model_evaluation_text_generation_task_with_gcs_input( + self, + job_spec, + mock_pipeline_service_create_eval, + mock_pipeline_job_get_eval, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + mock_request_urlopen_eval, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + 
): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + eval_metrics = my_model.evaluate( + task_spec=preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ), + ) + + assert isinstance(eval_metrics, preview_language_models.EvaluationMetric) + assert eval_metrics.bleu == _TEST_TEXT_GENERATION_METRICS["bleu"] + + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + def test_populate_eval_template_params( + self, + job_spec, + mock_pipeline_service_create, + mock_model_evaluate, + mock_pipeline_job_get, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + task_spec = preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ) + + formatted_template_params = ( + _evaluatable_language_models._populate_eval_template_params( + task_spec=task_spec, model_name=my_model._model_resource_name + ) + ) + + assert ( + "batch_predict_gcs_destination_output_uri" in formatted_template_params + ) + assert "model_name" in formatted_template_params + assert "evaluation_task" in formatted_template_params + + # This should only be in the classification task pipeline template + assert "evaluation_class_labels" not in formatted_template_params + assert 
"target_column_name" not in formatted_template_params + + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + def test_populate_template_params_for_classification_task( + self, + job_spec, + mock_pipeline_service_create, + mock_model_evaluate, + mock_pipeline_job_get, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + task_spec = preview_language_models.EvaluationTextClassificationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + target_column_name="test_targ_name", + class_names=["test_class_name_1", "test_class_name_2"], + ) + + formatted_template_params = ( + _evaluatable_language_models._populate_eval_template_params( + task_spec=task_spec, model_name=my_model._model_resource_name + ) + ) + + assert "evaluation_class_labels" in formatted_template_params + assert "target_field_name" in formatted_template_params + + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + "mock_storage_blob_upload_from_filename", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + def test_populate_template_params_with_dataframe_input( + self, + job_spec, + mock_pipeline_service_create, + mock_pipeline_job_get, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + ): + + 
aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + task_spec = preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data=_TEST_EVAL_DATA_DF, + ) + + formatted_template_params = ( + _evaluatable_language_models._populate_eval_template_params( + task_spec=task_spec, model_name=my_model._model_resource_name + ) + ) + + # The utility method should not modify task_spec + assert isinstance(task_spec.ground_truth_data, pd.DataFrame) + + assert ( + "batch_predict_gcs_destination_output_uri" in formatted_template_params + ) + assert "model_name" in formatted_template_params + assert "evaluation_task" in formatted_template_params + + # This should only be in the classification task pipeline template + assert "evaluation_class_labels" not in formatted_template_params + assert "target_column_name" not in formatted_template_params + + def test_evaluate_raises_on_ga_language_model( + self, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + model = language_models.TextGenerationModel.from_pretrained( + "text-bison@001" + ) + + with pytest.raises(AttributeError): + model.evaluate() + + @pytest.mark.usefixtures( + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + @pytest.mark.parametrize( + "mock_request_urlopen_eval", + 
["https://us-kfp.pkg.dev/proj/repo/pack/latest"], + indirect=True, + ) + def test_model_evaluation_text_generation_task_on_base_model( + self, + job_spec, + mock_pipeline_service_create_eval, + mock_pipeline_job_get_eval, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + mock_request_urlopen_eval, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.from_pretrained( + "text-bison@001" + ) + + eval_metrics = my_model.evaluate( + task_spec=preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ), + ) + + assert isinstance(eval_metrics, preview_language_models.EvaluationMetric) + + @pytest.mark.usefixtures( + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [ + _TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC_JSON, + _TEST_EVAL_CLASSIFICATION_PIPELINE_JOB, + ], + ) + @pytest.mark.parametrize( + "mock_request_urlopen_eval_classification", + ["https://us-central1-kfp.pkg.dev/proj/repo/pack/latest"], + indirect=True, + ) + def test_model_evaluation_text_classification_base_model_only_summary_metrics( + self, + job_spec, + mock_pipeline_service_create_eval_classification, + mock_pipeline_job_get_eval_classification, + mock_successfully_completed_eval_classification_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + mock_request_urlopen_eval_classification, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + 
_TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + my_model = preview_language_models.TextGenerationModel.from_pretrained( + "text-bison@001" + ) + + eval_metrics = my_model.evaluate( + task_spec=preview_language_models.EvaluationTextClassificationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + target_column_name="test_targ_name", + class_names=["test_class_name_1", "test_class_name_2"], + ) + ) + + assert isinstance( + eval_metrics, + preview_language_models.EvaluationClassificationMetric, + ) + assert eval_metrics.confidenceMetrics is None + assert eval_metrics.auPrc == _TEST_TEXT_CLASSIFICATION_METRICS["auPrc"] diff --git a/vertexai/language_models/_evaluatable_language_models.py b/vertexai/language_models/_evaluatable_language_models.py new file mode 100644 index 0000000000..eb3423fc93 --- /dev/null +++ b/vertexai/language_models/_evaluatable_language_models.py @@ -0,0 +1,755 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Classes for working with language models.""" + +import dataclasses +import os +from typing import Any, Dict, List, Optional, Type, TypeVar, Union + +from google.cloud import storage + +from google.cloud import aiplatform +from google.cloud.aiplatform import base +from google.cloud.aiplatform import initializer as aiplatform_initializer +from google.cloud.aiplatform import utils as aiplatform_utils +from google.cloud.aiplatform.utils import gcs_utils +from vertexai._model_garden import _model_garden_models + +from google.cloud.aiplatform.compat.services import ( + model_garden_service_client, +) +from google.cloud.aiplatform.compat.types import ( + pipeline_state as gca_pipeline_state, +) + +try: + import pandas +except ImportError: + pandas = None + + +_LOGGER = base.Logger(__name__) + +# Model Evaluation constants +_TEXT_CLASSIFICATION_TASK_NAME = "text-classification" +_TEXT_GENERATION_TASK_NAME = "text-generation" +_QA_TASK_NAME = "question-answering" +_SUMMARIZATION_TASK_NAME = "summarization" + +_EVALUATION_TASKS = frozenset( + [ + _TEXT_CLASSIFICATION_TASK_NAME, + _TEXT_GENERATION_TASK_NAME, + _QA_TASK_NAME, + _SUMMARIZATION_TASK_NAME, + ] +) + + +_TEXT_CLASSIFICATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-classification-pipeline" +_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-text-generation-pipeline" + +_EVALUATION_TEMPLATE_VERSION_TAG = "1.0.1" + +_EVALUATION_TEMPLATE_URLS = { + _TEXT_CLASSIFICATION_TASK_NAME: f"{_TEXT_CLASSIFICATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", + _TEXT_GENERATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", + _QA_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", + _SUMMARIZATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", +} + 
+ +_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER = "fpc-llm-evaluation" + +# TODO: update this when BP removes the input size limit +_BATCH_PREDICTION_ROW_LIMIT = 1000 + +_EVAL_SUPPORTED_BASE_MODELS = ["text-bison@001"] + +T = TypeVar("T", bound="_EvaluationMetricBase") + + +def _check_dataset_is_within_size_limit( + data: "pandas.DataFrame", +) -> None: + + if len(data) < _BATCH_PREDICTION_ROW_LIMIT: + return + + raise ValueError( + f"Your evaluation dataset size exceeds the limit of {_BATCH_PREDICTION_ROW_LIMIT}" + ) + + +def _get_model_resource_name_and_validate( + model_name: str, + model_info: _model_garden_models._ModelInfo, +) -> str: + """Returns the resource name string for the model. + + Model Registry resource names will stay the same. For Publisher Models, we need to + pass the full resource name (publishers/google/models/text-bison@001) to the evaluation + template and ensure the base model supports evaluation. + + Args: + model_name (str): + Required. The full resource name of the Model Registry model or base publisher model + to run evaluation on. + model_info (_model_garden_models._ModelInfo): + Required. The _ModelInfo object for the instance. + + Returns: + The formatted model_name string. + + Raises: + ValueError + If a base PublisherModel was provided and the model doesn't support evaluation. + """ + + if "publishers/" not in model_name: + # Model Registry resource + return model_name + + else: + if model_info.tuning_model_id in _EVAL_SUPPORTED_BASE_MODELS: + return f"{model_info.publisher_model_resource.name}@{model_info.publisher_model_resource.version_id}" + + raise ValueError( + f"The provided model {model_name} does not support evaluation." + ) + + +def _get_template_url(task_name: str) -> Optional[str]: + """Returns the pipeline template to use for the evaluation task. + + Args: + task_name (str): + Required. The name of the evaluation task to run. + + Returns: + The evaluation pipeline template path. 
+ """ + + return _EVALUATION_TEMPLATE_URLS.get(task_name) + + +@dataclasses.dataclass +class _EvaluationTaskSpec: + """Base class for task-specific model evaluation configuration parameters. + + This class should not be instantiated directly, instead use the subclass corresponding + to your evaluation task. + + Args: + ground_truth_data (Union[List[str], str, pandas.DataFrame]): + Required. The ground truth data to use for this evaluation job. This can be + either a Pandas DataFrame, a Cloud Storage URI of your JSONL data file, or a list of multiple + JSONL files on Cloud Storage. + + Raises: + ValueError: + If task_spec.ground_truth_data is formatted incorrectly. + If task_spec.ground_truth_data is a Pandas DataFrame and exceeds 1000 rows. + If task_spec.ground_truth_data is not a string, list, or Pandas DataFrame. + """ + + ground_truth_data: Union[List[str], str, "pandas.DataFrame"] + + @property + def task_name(self) -> str: + pass + + def __post_init__(self): + + if isinstance(self.ground_truth_data, str): + self.ground_truth_data = [self.ground_truth_data] + + if isinstance(self.ground_truth_data, list) and not all( + item.startswith("gs://") for item in self.ground_truth_data + ): + raise ValueError("Please provide a valid GCS URI starting with 'gs://'") + + if pandas and isinstance(self.ground_truth_data, pandas.DataFrame): + + _check_dataset_is_within_size_limit(self.ground_truth_data) + + +@dataclasses.dataclass +class EvaluationTextClassificationSpec(_EvaluationTaskSpec): + """Spec for text classification model evaluation tasks. + + Args: + target_column_name (str): + Required. The label column in the dataset provided in `ground_truth_data`. Required when task_name='text-classification'. + class_names (List[str]): + Required. A list of all possible label names in your dataset. Required when task_name='text-classification'. 
+ """ + + target_column_name: str + class_names: List[str] + + @property + def task_name(self) -> str: + return "text-classification" + + +@dataclasses.dataclass +class EvaluationTextGenerationSpec(_EvaluationTaskSpec): + """Spec for text generation model evaluation tasks.""" + + @property + def task_name(self) -> str: + return "text-generation" + + +@dataclasses.dataclass +class EvaluationQuestionAnsweringSpec(_EvaluationTaskSpec): + """Spec for question answering model evaluation tasks.""" + + task_name: str = "question-answering" + + +@dataclasses.dataclass +class EvaluationTextSummarizationSpec(_EvaluationTaskSpec): + """Spec for text summarization model evaluation tasks.""" + + task_name: str = "summarization" + + +@dataclasses.dataclass +class _EvaluationMetricBase: + """Base class for returned evaulation metrics""" + + @property + def input_dataset_paths(self) -> str: + """The Google Cloud Storage paths to the dataset used for this evaluation.""" + pass + + @property + def task_name(self) -> str: + """The type of evaluation task for the evaluation..""" + pass + + +@dataclasses.dataclass +class EvaluationMetric(_EvaluationMetricBase): + """The evaluation metric response. + + Args: + bleu (float): + Optional. BLEU (Bilingual evauation understudy). Scores based on sacrebleu implementation. + rougeLSum (float): + Optional. ROUGE-L (Longest Common Subsequence) scoring at summary level. + """ + + bleu: Optional[float] = None + rougeLSum: Optional[float] = None + + +@dataclasses.dataclass +class EvaluationClassificationMetric(_EvaluationMetricBase): + """The evaluation metric response for classification metrics. + + Args: + label_name (str): + Optional. The name of the label associated with the metrics. This is only + returned when `only_summary_metrics=False` is passed to evaluate(). + auPrc (float): + Optional. The area under the precision recall curve. + auRoc (float): + Optional. The area under the receiver operating characteristic curve. 
+ logLoss (float): + Optional. Logarithmic loss. + confidenceMetrics (List[Dict[str, Any]]): + Optional. This is only returned when `only_summary_metrics=False` is + passed to evaluate(). + confusionMatrix (Dict[str, Any]): + Optional. This is only returned when `only_summary_metrics=False` is + passed to evaluate(). + """ + + label_name: Optional[str] = None + auPrc: Optional[float] = None + auRoc: Optional[float] = None + logLoss: Optional[float] = None + confidenceMetrics: Optional[List[Dict[str, Any]]] = None + confusionMatrix: Optional[Dict[str, Any]] = None + + +@dataclasses.dataclass +class EvaluationSlicedClassificationMetric(_EvaluationMetricBase): + """The evaluation metric slices returned for classification metrics. + + This is returned when `only_summary_metrics=False` is passed to evaluate(). + + Args: + overall_metrics (EvaluationClassificationMetric): + The evaluation metrics across all slices of data + slices (List[EvaluationClassificationMetric]): + The evaluation metrics for each label slice. + """ + + overall_metrics: Optional[EvaluationClassificationMetric] = None + slices: Optional[List[EvaluationClassificationMetric]] = None + + +def _populate_eval_template_params( + task_spec: _EvaluationTaskSpec, + model_name: str, + service_account: Optional[str] = None, + machine_type: Optional[str] = None, + network: Optional[str] = None, + encryption_spec_key_name: Optional[str] = None, +) -> Dict[str, Any]: + """Populates a dictionary of template parameters for the evaluation PipelineJob. + + Args: + task_spec (EvaluationTaskSpec): + The EvaluationTaskSpec passed to evaluate() for this job + model_name (str): + The resource name of the model being evaluated. Either a PublisherModel or + ModelRegistry resource name. + service_account (Optional[str]): + The default service account for workload run-as account. + machine_type (Optional[str]): + Optional. The type of the machine to run the evaluation job on. + network (Optional[str]): + Optional. 
+        encryption_spec_key_name (Optional[str]):
+            Optional.
+
+    Returns:
+        Dict[str, Any]:
+            A dictionary of template parameter names and values to be passed to the PipelineJob
+            running the model evaluation.
+    """
+
+    ground_truth_data_gcs_path = task_spec.ground_truth_data
+
+    staging_bucket = aiplatform_initializer.global_config.staging_bucket
+
+    if not staging_bucket:
+        staging_bucket = (
+            gcs_utils.create_gcs_bucket_for_pipeline_artifacts_if_it_does_not_exist()
+        )
+
+    timestamped_eval_directory = (
+        f"evaluation_data_{aiplatform_utils.timestamped_unique_name()}"
+    )
+
+    if isinstance(task_spec.ground_truth_data, pandas.DataFrame):
+
+        # Convert to jsonl file and upload to gcs
+        dataset_uri = os.path.join(
+            staging_bucket,
+            timestamped_eval_directory,
+            "eval_data.jsonl",
+        )
+
+        gcs_utils._upload_pandas_df_to_gcs(
+            df=task_spec.ground_truth_data, upload_gcs_path=dataset_uri
+        )
+        ground_truth_data_gcs_path = [dataset_uri]
+
+    template_params = {
+        "project": aiplatform_initializer.global_config.project,
+        "location": aiplatform_initializer.global_config.location,
+        "batch_predict_gcs_destination_output_uri": f"{staging_bucket}/{timestamped_eval_directory}",
+        "model_name": model_name,
+        "batch_predict_gcs_source_uris": ground_truth_data_gcs_path,
+        "service_account": service_account,
+        "machine_type": machine_type,
+        "encryption_spec_key_name": encryption_spec_key_name
+        or aiplatform_initializer.global_config.encryption_spec_key_name,
+        "network": network or aiplatform_initializer.global_config.network,
+    }
+
+    if task_spec.task_name == _TEXT_CLASSIFICATION_TASK_NAME:
+        template_params["evaluation_class_labels"] = task_spec.class_names
+        template_params["target_field_name"] = task_spec.target_column_name
+    else:
+        template_params["evaluation_task"] = task_spec.task_name
+
+    return template_params
+
+
+# TODO (b/285947054): update to use public pipeline contract
+def _get_gcs_uri_from_pipeline_task_details(
+    pipeline_job: aiplatform.PipelineJob,
+) -> 
Optional[str]: + """Gets the GCS URI from the PipelineJob output. + + Args: + pipeline_job (aiplatform.PipelineJob) + The PipelineJob resource to get the metrics GCS URI from + + Returns: + The GCS URI of the evaluation metrics as a string. + """ + + for task in pipeline_job.task_details: + if task.task_name == pipeline_job.name and "evaluation_metrics" in task.outputs: + return task.outputs["evaluation_metrics"].artifacts[0].uri + + +def _convert_metrics_dict_to_response_type( + metrics_json: Dict[str, Any], + metric_type: Type[T], + metric_name: Optional[str] = None, +) -> EvaluationClassificationMetric: + metrics_response = metric_type() + if metric_name: + metrics_response.label_name = metric_name + + for metric, value in metrics_json.items(): + if hasattr(metrics_response, metric): + setattr(metrics_response, metric, value) + return metrics_response + + +def _format_classification_metrics( + metrics: Dict[str, Any] +) -> EvaluationSlicedClassificationMetric: + """Reformats classification metrics returned by the eval pipeline to make them more readable. + + Returned metrics are of type EvaluationSlicedClassificationMetric, with `overall` representing + the metrics for all data, and `slices` representing the metrics for each label in the dataset. + + Example schema of reformatted metrics: + + EvaluationSlicedClassificationMetrics( + overall_metrics=EvaluationClassificationMetric( + auPrc=... + ) + slices=[ + EvaluationClassificationMetric( + label_name="overall", + auPrc=..., + ... + ), + EvaluationClassificationMetric( + label_name="label_1", + auPrc=..., + ... + ), + EvaluationClassificationMetric( + label_name="label_2", + auPrc=..., + ... + ) + ] + ) + """ + + reformatted_metrics = EvaluationSlicedClassificationMetric() + + # TODO: see if we can do this without relying on specific keys, i.e. 
slicedMetrics
+
+    # First add overall metrics
+    overall_metrics = _convert_metrics_dict_to_response_type(
+        metrics_json=metrics["slicedMetrics"][0]["metrics"]["classification"],
+        metric_type=EvaluationClassificationMetric,
+    )
+    reformatted_metrics.overall_metrics = overall_metrics
+
+    sliced_metrics = []
+
+    # Then add metrics for each slice
+    for idx in range(1, len(metrics["slicedMetrics"])):
+        metric_slice_name = metrics["slicedMetrics"][idx]["singleOutputSlicingSpec"][
+            "value"
+        ]
+
+        sliced_metric = _convert_metrics_dict_to_response_type(
+            metrics_json=metrics["slicedMetrics"][idx]["metrics"]["classification"],
+            metric_type=EvaluationClassificationMetric,
+            metric_name=metric_slice_name,
+        )
+        sliced_metrics.append(sliced_metric)
+
+    reformatted_metrics.slices = sliced_metrics
+
+    return reformatted_metrics
+
+
+def _get_metrics_from_gcs_uri(
+    gcs_uri: str,
+) -> Union[
+    EvaluationMetric,
+    EvaluationClassificationMetric,
+    EvaluationSlicedClassificationMetric,
+]:
+    """Downloads evaluation metrics from GCS path."""
+
+    storage_client = storage.Client(
+        credentials=aiplatform_initializer.global_config.credentials
+    )
+
+    metrics_json = json.loads(storage.Blob.from_string(
+        uri=gcs_uri, client=storage_client
+    ).download_as_text())
+
+    # Sliced classification metrics case, format data
+    if "slicedMetrics" in metrics_json:
+        return _format_classification_metrics(metrics_json)
+    # If classification metrics don't contain slices, use EvaluationClassificationMetric type
+    if "auPrc" in metrics_json:
+        metrics_response = _convert_metrics_dict_to_response_type(
+            metrics_json=metrics_json,
+            metric_type=EvaluationClassificationMetric,
+        )
+    # All other metric types
+    else:
+        metrics_response = _convert_metrics_dict_to_response_type(
+            metrics_json=metrics_json,
+            metric_type=EvaluationMetric,
+        )
+    return metrics_response
+
+
+def _get_metrics_from_pipeline_task_details(
+    pipeline_job: aiplatform.PipelineJob,
+) -> Union[EvaluationMetric, 
EvaluationClassificationMetric]: + """Gets the evaluation metrics from the PipelineJob TaskDetails. + + Args: + pipeline_job (aiplatform.PipelineJob) + The PipelineJob resource to get the metrics from + + Returns: + A dictionary with the evaluation metrics + """ + metrics = {} + + # TODO (b/292076101): this now uses a public pipelines contract, but still relies on task_details + for task in pipeline_job.task_details: + if task.task_name == pipeline_job.name: + for output in task.outputs: + for metric_name, metric_value in ( + task.outputs[output].artifacts[0].metadata.items() + ): + metrics[metric_name] = metric_value + + if "auPrc" in metrics: + metrics_response = EvaluationClassificationMetric() + else: + metrics_response = EvaluationMetric() + + for metric, value in metrics.items(): + if hasattr(metrics_response, metric): + setattr(metrics_response, metric, value) + return metrics_response + + +class _LanguageModelEvaluationJob: + """Represents a model evaluation job for LLM models. + + These evaluation jobs are run as a Vertex Pipeline. 
+ """ + + def __init__( + self, + pipeline_job: aiplatform.PipelineJob, + ): + self._pipeline_job = pipeline_job + + def result( + self, *, only_summary_metrics: bool + ) -> Union[EvaluationMetric, EvaluationClassificationMetric]: + """Blocks on completion of the model evaluation PipelineJob and returns metrics.""" + + self._pipeline_job.wait() + + if only_summary_metrics: + return _get_metrics_from_pipeline_task_details(self._pipeline_job) + else: + gcs_uri = _get_gcs_uri_from_pipeline_task_details(self._pipeline_job) + if gcs_uri: + return _get_metrics_from_gcs_uri(gcs_uri) + + +class _EvaluatableLanguageModel: + """Mixin class for LLMs that support model evaluation.""" + + # TODO (b/282975912): convert training job specific args to a TrainingConfig + def evaluate( + self, + *, + task_spec: _EvaluationTaskSpec, + only_summary_metrics: Optional[bool] = True, + machine_type: Optional[str] = None, + ) -> Union[ + EvaluationMetric, + EvaluationClassificationMetric, + EvaluationSlicedClassificationMetric, + ]: + """Runs model evaluation using the provided input and ground truth data. + + This creates an evaluation job and blocks until the job completes, about + 10 - 20 minutes. + + Example: + ``` + model = TextGenerationModel.from_pretrained("text-bison@001") + eval_metrics = model.evaluate( + task_spec=EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ) + ) + ``` + + Args: + task_spec (_EvaluationTaskSpec): + Required. The configuration spec for your model evaluation job. Choose the spec corresponding + with the evaluation task you are performing, one of: EvaluationClassificationSpec, EvaluationTextGenerationSpec, + EvaluationTextSummarizationSpec, EvaluationQuestionAnsweringSpec. 
+ + For example, a valid classification `task_spec` is: + EvaluationTextClassificationSpec( + ground_truth_data=["gs://bucket/path/to/your/data.jsonl"], + class_names=["cheddar", "gouda", "camembert"], + target_column_name="cheese_type", + ) + only_summary_metrics (bool): + Optional. Setting this field to False only affects the metrics returned for text classification tasks. + When False, text classification metrics will include additional sliced metrics fields, with metrics for + each label slice in the data. + machine_type (str): + Optional. The type of the machine to run the evaluation job on. The default value is "e2-highmem-16". For + tasks with a large evaluation dataset, a bigger machine type may be required. + For more details about this input config, see + https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types. + + Returns: + Union[EvaluationMetric, EvaluationClassificationMetric, List[EvaluationClassificationMetric]] + The evaluation metrics from this evaluation job. When `only_summary_metrics=False` is passed + and the evaluation task type is 'text-classification', the return type will be List[EvaluationClassificationMetric], + where each value in the list is the metrics associated with a particular classification label. 
+ """ + + model_info = _model_garden_models._get_model_info( + self._model_id, + schema_to_class_map={self._INSTANCE_SCHEMA_URI: type(self)}, + ) + model_name = _get_model_resource_name_and_validate( + model_name=self._model_resource_name, model_info=model_info + ) + + # TODO(b/296402511): get service_account from aiplatform_initializer and pass it to the template here and to PipelineJob after cl/539823838 is submitted + template_params = _populate_eval_template_params( + task_spec=task_spec, + model_name=model_name, + machine_type=machine_type, + network=aiplatform_initializer.global_config.network, + encryption_spec_key_name=aiplatform_initializer.global_config.encryption_spec_key_name, + ) + + template_path = _get_template_url(task_spec.task_name) + + pipeline_job = aiplatform.PipelineJob( + template_path=template_path, + parameter_values=template_params, + display_name=f"llm-eval-sdk-{aiplatform_utils.timestamped_unique_name()}", + ) + pipeline_job.submit() + + eval_job = _LanguageModelEvaluationJob(pipeline_job=pipeline_job) + + _LOGGER.info( + "Your evaluation job is running and will take 15-20 minutes to complete. Click on the PipelineJob link to view progress." + ) + + # NOTE: only_summary_metrics is passed because getting metrics from the artifact is faster than downloading from GCS + # GCS is only needed for additional metrics for text-classification tasks + return eval_job.result(only_summary_metrics=only_summary_metrics) + + def list_evaluation_metrics( + self, + *, + task_name: Optional[str] = None, + only_summary_metrics: Optional[bool] = True, + ) -> List[Union[EvaluationMetric, EvaluationClassificationMetric]]: + """Lists the evaluation metrics from all evaluation jobs run on this model. + + Args: + task_name (str): + Optional. The task name to return evaluation metrics for. If provided, this will only return evaluation + metrics for tasks of the provided type. 
This matches the possible values of a task spec's `task_name`,
+                and must be one of 'text-generation', 'text-classification', 'summarization', or 'question-answering'.
+
+        Returns:
+            List[Union[EvaluationMetric, EvaluationClassificationMetric]]
+                The evaluation metrics from all evaluation jobs run on this model.
+
+        """
+
+        model_name = self._model_resource_name
+
+        publisher_model_parts = model_garden_service_client.ModelGardenServiceClient.parse_publisher_model_path(
+            "".join(model_name.rpartition("publishers")[1:])
+        )
+
+        if publisher_model_parts:
+            model_id = publisher_model_parts["model"]
+            model_name = f"publishers/google/models/{model_id}"
+
+        filters = f'metadata.component_type.string_value={_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER} AND metadata."input:model_name".string_value={model_name} AND (metadata."input:evaluation_task".string_value={_TEXT_GENERATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_SUMMARIZATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_QA_TASK_NAME} OR metadata."input:evaluation_task".string_value={_TEXT_CLASSIFICATION_TASK_NAME})'
+
+        # NOTE: when task_name is appended to the filter the block of OR filters in `filters` above becomes a no-op
+        if task_name:
+            filters += f' AND metadata."input:evaluation_task".string_value={task_name}'
+
+        filtered_pipeline_executions = aiplatform.Execution.list(
+            filter=filters,
+            project=aiplatform_initializer.global_config.project,
+            location=aiplatform_initializer.global_config.location,
+            credentials=aiplatform_initializer.global_config.credentials,
+        )
+
+        model_eval_metrics = []
+
+        # TODO (b/285950380): improve performance of this method
+        for pipeline_execution in filtered_pipeline_executions:
+            if "pipeline_job_resource_name" not in pipeline_execution.metadata:
+                continue
+
+            pipeline_job_resource = aiplatform.PipelineJob.get(
+                resource_name=pipeline_execution.metadata["pipeline_job_resource_name"]
+            )
+            eval_job_state = pipeline_job_resource._gca_resource.state
+
+            if (
+                eval_job_state
+                
!= gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ): + continue + + metrics = None + + if only_summary_metrics: + metrics = _get_metrics_from_pipeline_task_details(pipeline_job_resource) + else: + gcs_uri = _get_gcs_uri_from_pipeline_task_details(pipeline_job_resource) + if gcs_uri: + metrics = _get_metrics_from_gcs_uri(gcs_uri) + + metrics.input_dataset_paths = pipeline_execution.metadata[ + "input:batch_predict_gcs_source_uris" + ] + metrics.task_name = pipeline_execution.metadata["input:evaluation_task"] + + model_eval_metrics.append(metrics) + + return model_eval_metrics diff --git a/vertexai/language_models/_language_models.py b/vertexai/language_models/_language_models.py index 3ef46cc88e..28d73f4eac 100644 --- a/vertexai/language_models/_language_models.py +++ b/vertexai/language_models/_language_models.py @@ -24,6 +24,9 @@ from google.cloud.aiplatform import utils as aiplatform_utils from google.cloud.aiplatform.utils import gcs_utils from vertexai._model_garden import _model_garden_models +from vertexai.language_models import ( + _evaluatable_language_models, +) try: import pandas @@ -497,7 +500,10 @@ class TextGenerationModel(_TextGenerationModel, _ModelWithBatchPredict): class _PreviewTextGenerationModel( - _TextGenerationModel, _TunableModelMixin, _PreviewModelWithBatchPredict + _TextGenerationModel, + _TunableModelMixin, + _PreviewModelWithBatchPredict, + _evaluatable_language_models._EvaluatableLanguageModel, ): # Do not add docstring so that it's inherited from the base class. 
_LAUNCH_STAGE = _model_garden_models._SDK_PUBLIC_PREVIEW_LAUNCH_STAGE @@ -696,7 +702,9 @@ def start_chat( *, context: Optional[str] = None, examples: Optional[List[InputOutputTextPair]] = None, - max_output_tokens: Optional[int] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, + max_output_tokens: Optional[ + int + ] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -801,7 +809,7 @@ def start_chat( model=self, max_output_tokens=max_output_tokens, temperature=temperature, - message_history=message_history + message_history=message_history, ) @@ -820,7 +828,9 @@ def __init__( model: _ChatModelBase, context: Optional[str] = None, examples: Optional[List[InputOutputTextPair]] = None, - max_output_tokens: Optional[int] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, + max_output_tokens: Optional[ + int + ] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -955,7 +965,9 @@ def __init__( model: ChatModel, context: Optional[str] = None, examples: Optional[List[InputOutputTextPair]] = None, - max_output_tokens: Optional[int] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, + max_output_tokens: Optional[ + int + ] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, diff --git a/vertexai/preview/language_models.py b/vertexai/preview/language_models.py index 6ecf2a6d54..447b3a0f9f 100644 --- a/vertexai/preview/language_models.py +++ b/vertexai/preview/language_models.py @@ -29,6 +29,16 @@ TextGenerationResponse, ) +from vertexai.language_models._evaluatable_language_models import ( + EvaluationTextGenerationSpec, + EvaluationTextSummarizationSpec, + EvaluationQuestionAnsweringSpec, + EvaluationTextClassificationSpec, + EvaluationClassificationMetric, + EvaluationMetric, +) + + 
ChatModel = _PreviewChatModel CodeChatModel = _PreviewCodeChatModel CodeGenerationModel = _PreviewCodeGenerationModel @@ -42,6 +52,12 @@ "CodeChatModel", "CodeChatSession", "CodeGenerationModel", + "EvaluationClassificationMetric", + "EvaluationMetric", + "EvaluationTextGenerationSpec", + "EvaluationTextSummarizationSpec", + "EvaluationQuestionAnsweringSpec", + "EvaluationTextClassificationSpec", "InputOutputTextPair", "TextEmbedding", "TextEmbeddingModel",