From 8df5185d668292d5adc11ebf9477e2fdd44599d4 Mon Sep 17 00:00:00 2001 From: Sara Robinson Date: Thu, 17 Aug 2023 21:53:13 -0700 Subject: [PATCH] feat: LLM - release model evaluation for TextGenerationModel to public preview PiperOrigin-RevId: 558027302 --- tests/unit/aiplatform/test_language_models.py | 836 +++++++++++++++++- .../_evaluatable_language_models.py | 755 ++++++++++++++++ vertexai/language_models/_language_models.py | 22 +- vertexai/preview/language_models.py | 16 + 4 files changed, 1622 insertions(+), 7 deletions(-) create mode 100644 vertexai/language_models/_evaluatable_language_models.py diff --git a/tests/unit/aiplatform/test_language_models.py b/tests/unit/aiplatform/test_language_models.py index 5dd1c74dad..76d6c144d6 100644 --- a/tests/unit/aiplatform/test_language_models.py +++ b/tests/unit/aiplatform/test_language_models.py @@ -41,6 +41,7 @@ ) from google.cloud.aiplatform.compat.services import prediction_service_client from google.cloud.aiplatform.compat.types import ( + artifact as gca_artifact, prediction_service as gca_prediction_service, context as gca_context, endpoint as gca_endpoint, @@ -58,6 +59,9 @@ ) from vertexai import language_models from vertexai.language_models import _language_models +from vertexai.language_models import ( + _evaluatable_language_models, +) from google.cloud.aiplatform_v1 import Execution as GapicExecution from google.cloud.aiplatform.compat.types import ( encryption_spec as gca_encryption_spec, @@ -326,6 +330,251 @@ def reverse_string_2(s):""", } ) +_TEST_TEXT_GENERATION_METRICS = { + "bleu": 3.9311041439597427, + "rougeLSum": 19.014677479620463, +} + + +_TEST_TEXT_CLASSIFICATION_METRICS = {"auPrc": 0.9, "auRoc": 0.8, "logLoss": 0.5} + +_TEST_EVAL_DATA = [ + { + "prompt": "Basketball teams in the Midwest.", + "ground_truth": "There are several basketball teams located in the Midwest region of the United States. 
Here are some of them:", + }, + { + "prompt": "How to bake gluten-free bread?", + "ground_truth": "Baking gluten-free bread can be a bit challenging because gluten is the protein that gives bread its structure and elasticity.", + }, +] + +_TEST_EVAL_DATA_DF = pd.DataFrame(_TEST_EVAL_DATA) + +_TEST_ARTIFACT_ID = "123456" +_TEST_ARTIFACT_NAME = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}/metadataStores/default/artifacts/{_TEST_ARTIFACT_ID}" + +_TEST_EVAL_PIPELINE_SPEC = { + "components": {}, + "pipelineInfo": {"name": "evaluation-llm-text-generation-pipeline"}, + "root": { + "dag": {"tasks": {}}, + "inputDefinitions": { + "parameters": { + "batch_predict_accelerator_count": { + "defaultValue": 0.0, + "isOptional": True, + "parameterType": "NUMBER_INTEGER", + }, + "batch_predict_accelerator_type": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_gcs_source_uris": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "batch_predict_gcs_destination_output_uri": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "enable_web_access": { + "defaultValue": True, + "isOptional": True, + "parameterType": "BOOLEAN", + }, + "encryption_spec_key_name": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "evaluation_display_name": { + "defaultValue": "evaluation-text-generation", + "isOptional": True, + "parameterType": "STRING", + }, + "location": { + "defaultValue": "us-central1", + "isOptional": True, + "parameterType": "STRING", + }, + "machine_type": { + "defaultValue": "e2-highmem-16", + "isOptional": True, + "parameterType": "STRING", + }, + "model_name": {"parameterType": "STRING"}, + "evaluation_task": {"parameterType": "STRING"}, + "network": { + "defaultValue": "", + "isOptional": True, + "parameterType": 
"STRING", + }, + "nlp_task": { + "defaultValue": "text-generation", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_gcs_source": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "project": {"parameterType": "STRING"}, + "root_dir": {"parameterType": "STRING"}, + "service_account": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + } + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.0-beta.14", +} + + +_TEST_EVAL_PIPELINE_SPEC_JSON = json.dumps( + _TEST_EVAL_PIPELINE_SPEC, +) + +_TEST_EVAL_PIPELINE_JOB = json.dumps( + { + "runtimeConfig": {"parameterValues": {}}, + "pipelineSpec": json.loads(_TEST_EVAL_PIPELINE_SPEC_JSON), + } +) + +# Eval classification spec + +_TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC = { + "components": {}, + "pipelineInfo": {"name": "evaluation-llm-text-generation-pipeline"}, + "root": { + "dag": {"tasks": {}}, + "inputDefinitions": { + "parameters": { + "batch_predict_accelerator_count": { + "defaultValue": 0.0, + "isOptional": True, + "parameterType": "NUMBER_INTEGER", + }, + "batch_predict_accelerator_type": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_gcs_source_uris": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "batch_predict_gcs_destination_output_uri": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "batch_predict_predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "enable_web_access": { + "defaultValue": True, + "isOptional": True, + "parameterType": "BOOLEAN", + }, + "encryption_spec_key_name": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "evaluation_display_name": { + "defaultValue": "evaluation-text-generation", + 
"isOptional": True, + "parameterType": "STRING", + }, + "location": { + "defaultValue": "us-central1", + "isOptional": True, + "parameterType": "STRING", + }, + "machine_type": { + "defaultValue": "e2-highmem-16", + "isOptional": True, + "parameterType": "STRING", + }, + "model_name": {"parameterType": "STRING"}, + "evaluation_task": {"parameterType": "STRING"}, + "network": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "nlp_task": { + "defaultValue": "text-generation", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_format": { + "defaultValue": "jsonl", + "isOptional": True, + "parameterType": "STRING", + }, + "predictions_gcs_source": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "evaluation_class_labels": { + "defaultValue": [], + "isOptional": True, + "parameterType": "LIST", + }, + "target_field_name": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + "project": {"parameterType": "STRING"}, + "root_dir": {"parameterType": "STRING"}, + "service_account": { + "defaultValue": "", + "isOptional": True, + "parameterType": "STRING", + }, + } + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.0-beta.14", +} + +_TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC_JSON = json.dumps( + _TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC, +) + +_TEST_EVAL_CLASSIFICATION_PIPELINE_JOB = json.dumps( + { + "runtimeConfig": {"parameterValues": {}}, + "pipelineSpec": json.loads(_TEST_EVAL_PIPELINE_SPEC_JSON), + } +) + @pytest.fixture def mock_pipeline_bucket_exists(): @@ -382,6 +631,96 @@ def make_pipeline_job(state): ) +def make_eval_pipeline_job(state): + return gca_pipeline_job.PipelineJob( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + state=state, + create_time=test_constants.PipelineJobConstants._TEST_PIPELINE_CREATE_TIME, + service_account=test_constants.ProjectConstants._TEST_SERVICE_ACCOUNT, + 
network=test_constants.TrainingJobConstants._TEST_NETWORK, + job_detail=gca_pipeline_job.PipelineJobDetail( + pipeline_run_context=gca_context.Context( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + ), + task_details=[ + gca_pipeline_job.PipelineTaskDetail( + task_id=456, + task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + name="test-metric-artifact", + metadata=_TEST_TEXT_GENERATION_METRICS, + ), + ], + ) + }, + ), + gca_pipeline_job.PipelineTaskDetail( + task_id=789, + task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + display_name="evaluation_metrics", + uri="gs://test-bucket/evaluation_metrics", + ), + ] + ) + }, + ), + ], + ), + ) + + +def make_eval_classification_pipeline_job(state): + return gca_pipeline_job.PipelineJob( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + state=state, + create_time=test_constants.PipelineJobConstants._TEST_PIPELINE_CREATE_TIME, + service_account=test_constants.ProjectConstants._TEST_SERVICE_ACCOUNT, + network=test_constants.TrainingJobConstants._TEST_NETWORK, + job_detail=gca_pipeline_job.PipelineJobDetail( + pipeline_run_context=gca_context.Context( + name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_NAME, + ), + task_details=[ + gca_pipeline_job.PipelineTaskDetail( + task_id=456, + task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + name="test-metric-artifact", + metadata=_TEST_TEXT_CLASSIFICATION_METRICS, + ), + ], + ) + }, + ), + gca_pipeline_job.PipelineTaskDetail( + task_id=789, + 
task_name=test_constants.PipelineJobConstants._TEST_PIPELINE_JOB_ID, + outputs={ + "evaluation_metrics": gca_pipeline_job.PipelineTaskDetail.ArtifactList( + artifacts=[ + gca_artifact.Artifact( + display_name="evaluation_metrics", + uri="gs://test-bucket/evaluation_metrics", + ), + ] + ) + }, + ), + ], + ), + ) + + @pytest.fixture def mock_pipeline_service_create(): with mock.patch.object( @@ -393,6 +732,28 @@ def mock_pipeline_service_create(): yield mock_create_pipeline_job +@pytest.fixture +def mock_pipeline_service_create_eval(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "create_pipeline_job" + ) as mock_create_pipeline_job: + mock_create_pipeline_job.return_value = make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_create_pipeline_job + + +@pytest.fixture +def mock_pipeline_service_create_eval_classification(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "create_pipeline_job" + ) as mock_create_pipeline_job: + mock_create_pipeline_job.return_value = make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_create_pipeline_job + + @pytest.fixture def mock_pipeline_job_get(): with mock.patch.object( @@ -429,6 +790,82 @@ def mock_pipeline_job_get(): yield mock_get_pipeline_job +@pytest.fixture +def mock_pipeline_job_get_eval(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_pipeline_job: + mock_get_pipeline_job.side_effect = [ + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + 
gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + ] + + yield mock_get_pipeline_job + + +@pytest.fixture +def mock_pipeline_job_get_eval_classification(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_pipeline_job: + mock_get_pipeline_job.side_effect = [ + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ), + ] + + yield mock_get_pipeline_job + + @pytest.fixture def mock_load_yaml_and_json(job_spec): with mock.patch.object( @@ -451,8 +888,34 @@ def mock_gcs_upload(): @pytest.fixture -def mock_request_urlopen(request: str) -> Tuple[str, mock.MagicMock]: - data = _TEST_PIPELINE_SPEC +def mock_request_urlopen(request: str) -> Tuple[str, mock.MagicMock]: 
+ data = _TEST_PIPELINE_SPEC + with mock.patch.object(urllib_request, "urlopen") as mock_urlopen: + mock_read_response = mock.MagicMock() + mock_decode_response = mock.MagicMock() + mock_decode_response.return_value = json.dumps(data) + mock_read_response.return_value.decode = mock_decode_response + mock_urlopen.return_value.read = mock_read_response + yield request.param, mock_urlopen + + +@pytest.fixture +def mock_request_urlopen_eval(request: str) -> Tuple[str, mock.MagicMock]: + data = _TEST_EVAL_PIPELINE_SPEC + with mock.patch.object(urllib_request, "urlopen") as mock_urlopen: + mock_read_response = mock.MagicMock() + mock_decode_response = mock.MagicMock() + mock_decode_response.return_value = json.dumps(data) + mock_read_response.return_value.decode = mock_decode_response + mock_urlopen.return_value.read = mock_read_response + yield request.param, mock_urlopen + + +@pytest.fixture +def mock_request_urlopen_eval_classification( + request: str, +) -> Tuple[str, mock.MagicMock]: + data = _TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC with mock.patch.object(urllib_request, "urlopen") as mock_urlopen: mock_read_response = mock.MagicMock() mock_decode_response = mock.MagicMock() @@ -528,6 +991,48 @@ def get_endpoint_with_models_mock(): yield get_endpoint_models_mock +# Model Evaluation fixtures +@pytest.fixture +def mock_model_evaluate(): + with mock.patch.object( + _evaluatable_language_models._EvaluatableLanguageModel, "evaluate" + ) as mock_model_evaluate: + mock_model_evaluate.return_value = _TEST_TEXT_GENERATION_METRICS + yield mock_model_evaluate + + +@pytest.fixture +def mock_successfully_completed_eval_job(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_model_eval_job: + mock_get_model_eval_job.return_value = make_eval_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_get_model_eval_job + + +@pytest.fixture +def 
mock_successfully_completed_eval_classification_job(): + with mock.patch.object( + pipeline_service_client.PipelineServiceClient, "get_pipeline_job" + ) as mock_get_model_eval_job: + mock_get_model_eval_job.return_value = make_eval_classification_pipeline_job( + gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + yield mock_get_model_eval_job + + +@pytest.fixture +def mock_storage_blob_upload_from_filename(): + with mock.patch( + "google.cloud.storage.Blob.upload_from_filename" + ) as mock_blob_upload_from_filename, mock.patch( + "google.cloud.storage.Bucket.exists", return_value=True + ): + yield mock_blob_upload_from_filename + + @pytest.mark.usefixtures("google_auth_mock") class TestLanguageModels: """Unit tests for the language models.""" @@ -1530,3 +2035,330 @@ def test_batch_prediction_for_text_embedding(self): gcs_destination_prefix="gs://test-bucket/results/", model_parameters={}, ) + + +# TODO (b/285946649): add more test coverage before public preview release +@pytest.mark.usefixtures("google_auth_mock") +class TestLanguageModelEvaluation: + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + @pytest.mark.parametrize( + "mock_request_urlopen_eval", + ["https://us-kfp.pkg.dev/proj/repo/pack/latest"], + indirect=True, + ) + def test_model_evaluation_text_generation_task_with_gcs_input( + self, + job_spec, + mock_pipeline_service_create_eval, + mock_pipeline_job_get_eval, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + mock_request_urlopen_eval, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + 
): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + eval_metrics = my_model.evaluate( + task_spec=preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ), + ) + + assert isinstance(eval_metrics, preview_language_models.EvaluationMetric) + assert eval_metrics.bleu == _TEST_TEXT_GENERATION_METRICS["bleu"] + + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + def test_populate_eval_template_params( + self, + job_spec, + mock_pipeline_service_create, + mock_model_evaluate, + mock_pipeline_job_get, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + task_spec = preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ) + + formatted_template_params = ( + _evaluatable_language_models._populate_eval_template_params( + task_spec=task_spec, model_name=my_model._model_resource_name + ) + ) + + assert ( + "batch_predict_gcs_destination_output_uri" in formatted_template_params + ) + assert "model_name" in formatted_template_params + assert "evaluation_task" in formatted_template_params + + # This should only be in the classification task pipeline template + assert "evaluation_class_labels" not in formatted_template_params + assert 
"target_column_name" not in formatted_template_params + + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + def test_populate_template_params_for_classification_task( + self, + job_spec, + mock_pipeline_service_create, + mock_model_evaluate, + mock_pipeline_job_get, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + task_spec = preview_language_models.EvaluationTextClassificationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + target_column_name="test_targ_name", + class_names=["test_class_name_1", "test_class_name_2"], + ) + + formatted_template_params = ( + _evaluatable_language_models._populate_eval_template_params( + task_spec=task_spec, model_name=my_model._model_resource_name + ) + ) + + assert "evaluation_class_labels" in formatted_template_params + assert "target_field_name" in formatted_template_params + + @pytest.mark.usefixtures( + "get_model_with_tuned_version_label_mock", + "get_endpoint_with_models_mock", + "mock_storage_blob_upload_from_filename", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + def test_populate_template_params_with_dataframe_input( + self, + job_spec, + mock_pipeline_service_create, + mock_pipeline_job_get, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + ): + + 
aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.get_tuned_model( + test_constants.ModelConstants._TEST_MODEL_RESOURCE_NAME + ) + + task_spec = preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data=_TEST_EVAL_DATA_DF, + ) + + formatted_template_params = ( + _evaluatable_language_models._populate_eval_template_params( + task_spec=task_spec, model_name=my_model._model_resource_name + ) + ) + + # The utility method should not modify task_spec + assert isinstance(task_spec.ground_truth_data, pd.DataFrame) + + assert ( + "batch_predict_gcs_destination_output_uri" in formatted_template_params + ) + assert "model_name" in formatted_template_params + assert "evaluation_task" in formatted_template_params + + # This should only be in the classification task pipeline template + assert "evaluation_class_labels" not in formatted_template_params + assert "target_column_name" not in formatted_template_params + + def test_evaluate_raises_on_ga_language_model( + self, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + model = language_models.TextGenerationModel.from_pretrained( + "text-bison@001" + ) + + with pytest.raises(AttributeError): + model.evaluate() + + @pytest.mark.usefixtures( + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [_TEST_EVAL_PIPELINE_SPEC_JSON, _TEST_EVAL_PIPELINE_JOB], + ) + @pytest.mark.parametrize( + "mock_request_urlopen_eval", + 
["https://us-kfp.pkg.dev/proj/repo/pack/latest"], + indirect=True, + ) + def test_model_evaluation_text_generation_task_on_base_model( + self, + job_spec, + mock_pipeline_service_create_eval, + mock_pipeline_job_get_eval, + mock_successfully_completed_eval_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + mock_request_urlopen_eval, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + _TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + + my_model = preview_language_models.TextGenerationModel.from_pretrained( + "text-bison@001" + ) + + eval_metrics = my_model.evaluate( + task_spec=preview_language_models.EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ), + ) + + assert isinstance(eval_metrics, preview_language_models.EvaluationMetric) + + @pytest.mark.usefixtures( + "get_endpoint_with_models_mock", + ) + @pytest.mark.parametrize( + "job_spec", + [ + _TEST_EVAL_CLASSIFICATION_PIPELINE_SPEC_JSON, + _TEST_EVAL_CLASSIFICATION_PIPELINE_JOB, + ], + ) + @pytest.mark.parametrize( + "mock_request_urlopen_eval_classification", + ["https://us-central1-kfp.pkg.dev/proj/repo/pack/latest"], + indirect=True, + ) + def test_model_evaluation_text_classification_base_model_only_summary_metrics( + self, + job_spec, + mock_pipeline_service_create_eval_classification, + mock_pipeline_job_get_eval_classification, + mock_successfully_completed_eval_classification_job, + mock_pipeline_bucket_exists, + mock_load_yaml_and_json, + mock_request_urlopen_eval_classification, + ): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with mock.patch.object( + target=model_garden_service_client.ModelGardenServiceClient, + attribute="get_publisher_model", + return_value=gca_publisher_model.PublisherModel( + 
_TEXT_BISON_PUBLISHER_MODEL_DICT + ), + ): + my_model = preview_language_models.TextGenerationModel.from_pretrained( + "text-bison@001" + ) + + eval_metrics = my_model.evaluate( + task_spec=preview_language_models.EvaluationTextClassificationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + target_column_name="test_targ_name", + class_names=["test_class_name_1", "test_class_name_2"], + ) + ) + + assert isinstance( + eval_metrics, + preview_language_models.EvaluationClassificationMetric, + ) + assert eval_metrics.confidenceMetrics is None + assert eval_metrics.auPrc == _TEST_TEXT_CLASSIFICATION_METRICS["auPrc"] diff --git a/vertexai/language_models/_evaluatable_language_models.py b/vertexai/language_models/_evaluatable_language_models.py new file mode 100644 index 0000000000..eb3423fc93 --- /dev/null +++ b/vertexai/language_models/_evaluatable_language_models.py @@ -0,0 +1,755 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Classes for working with language models.""" + +import dataclasses +import os +from typing import Any, Dict, List, Optional, Type, TypeVar, Union + +from google.cloud import storage + +from google.cloud import aiplatform +from google.cloud.aiplatform import base +from google.cloud.aiplatform import initializer as aiplatform_initializer +from google.cloud.aiplatform import utils as aiplatform_utils +from google.cloud.aiplatform.utils import gcs_utils +from vertexai._model_garden import _model_garden_models + +from google.cloud.aiplatform.compat.services import ( + model_garden_service_client, +) +from google.cloud.aiplatform.compat.types import ( + pipeline_state as gca_pipeline_state, +) + +try: + import pandas +except ImportError: + pandas = None + + +_LOGGER = base.Logger(__name__) + +# Model Evaluation constants +_TEXT_CLASSIFICATION_TASK_NAME = "text-classification" +_TEXT_GENERATION_TASK_NAME = "text-generation" +_QA_TASK_NAME = "question-answering" +_SUMMARIZATION_TASK_NAME = "summarization" + +_EVALUATION_TASKS = frozenset( + [ + _TEXT_CLASSIFICATION_TASK_NAME, + _TEXT_GENERATION_TASK_NAME, + _QA_TASK_NAME, + _SUMMARIZATION_TASK_NAME, + ] +) + + +_TEXT_CLASSIFICATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-classification-pipeline" +_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-text-generation-pipeline" + +_EVALUATION_TEMPLATE_VERSION_TAG = "1.0.1" + +_EVALUATION_TEMPLATE_URLS = { + _TEXT_CLASSIFICATION_TASK_NAME: f"{_TEXT_CLASSIFICATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", + _TEXT_GENERATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", + _QA_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", + _SUMMARIZATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}", +} + 
+ +_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER = "fpc-llm-evaluation" + +# TODO: update this when BP removes the input size limit +_BATCH_PREDICTION_ROW_LIMIT = 1000 + +_EVAL_SUPPORTED_BASE_MODELS = ["text-bison@001"] + +T = TypeVar("T", bound="_EvaluationMetricBase") + + +def _check_dataset_is_within_size_limit( + data: "pandas.DataFrame", +) -> None: + + if len(data) < _BATCH_PREDICTION_ROW_LIMIT: + return + + raise ValueError( + f"Your evaluation dataset size exceeds the limit of {_BATCH_PREDICTION_ROW_LIMIT}" + ) + + +def _get_model_resource_name_and_validate( + model_name: str, + model_info: _model_garden_models._ModelInfo, +) -> str: + """Returns the resource name string for the model. + + Model Registry resource names will stay the same. For Publisher Models, we need to + pass the full resource name (publishers/google/models/text-bison@001) to the evaluation + template and ensure the base model supports evaluation. + + Args: + model_name (str): + Required. The full resource name of the Model Registry model or base publisher model + to run evaluation on. + model_info (_model_garden_models._ModelInfo): + Required. The _ModelInfo object for the instance. + + Returns: + The formatted model_name string. + + Raises: + ValueError + If a base PublisherModel was provided and the model doesn't support evaluation. + """ + + if "publishers/" not in model_name: + # Model Registry resource + return model_name + + else: + if model_info.tuning_model_id in _EVAL_SUPPORTED_BASE_MODELS: + return f"{model_info.publisher_model_resource.name}@{model_info.publisher_model_resource.version_id}" + + raise ValueError( + f"The provided model {model_name} does not support evaluation." + ) + + +def _get_template_url(task_name: str) -> Optional[str]: + """Returns the pipeline template to use for the evaluation task. + + Args: + task_name (str): + Required. The name of the evaluation task to run. + + Returns: + The evaluation pipeline template path. 
+ """ + + return _EVALUATION_TEMPLATE_URLS.get(task_name) + + +@dataclasses.dataclass +class _EvaluationTaskSpec: + """Base class for task-specific model evaluation configuration parameters. + + This class should not be instantiated directly, instead use the subclass corresponding + to your evaluation task. + + Args: + ground_truth_data (Union[List[str], str, pandas.DataFrame]): + Required. The ground truth data to use for this evaluation job. This can be + either a Pandas DataFrame, a Cloud Storage URI of your JSONL data file, or a list of multiple + JSONL files on Cloud Storage. + + Raises: + ValueError: + If task_spec.ground_truth_data is formatted incorrectly. + If task_spec.ground_truth_data is a Pandas DataFrame and exceeds 1000 rows. + If task_spec.ground_truth_data is not a string, list, or Pandas DataFrame. + """ + + ground_truth_data: Union[List[str], str, "pandas.DataFrame"] + + @property + def task_name(self) -> str: + pass + + def __post_init__(self): + + if isinstance(self.ground_truth_data, str): + self.ground_truth_data = [self.ground_truth_data] + + if isinstance(self.ground_truth_data, list) and not all( + item.startswith("gs://") for item in self.ground_truth_data + ): + raise ValueError("Please provide a valid GCS URI starting with 'gs://'") + + if pandas and isinstance(self.ground_truth_data, pandas.DataFrame): + + _check_dataset_is_within_size_limit(self.ground_truth_data) + + +@dataclasses.dataclass +class EvaluationTextClassificationSpec(_EvaluationTaskSpec): + """Spec for text classification model evaluation tasks. + + Args: + target_column_name (str): + Required. The label column in the dataset provided in `ground_truth_data`. Required when task_name='text-classification'. + class_names (List[str]): + Required. A list of all possible label names in your dataset. Required when task_name='text-classification'. 
+ """ + + target_column_name: str + class_names: List[str] + + @property + def task_name(self) -> str: + return "text-classification" + + +@dataclasses.dataclass +class EvaluationTextGenerationSpec(_EvaluationTaskSpec): + """Spec for text generation model evaluation tasks.""" + + @property + def task_name(self) -> str: + return "text-generation" + + +@dataclasses.dataclass +class EvaluationQuestionAnsweringSpec(_EvaluationTaskSpec): + """Spec for question answering model evaluation tasks.""" + + task_name: str = "question-answering" + + +@dataclasses.dataclass +class EvaluationTextSummarizationSpec(_EvaluationTaskSpec): + """Spec for text summarization model evaluation tasks.""" + + task_name: str = "summarization" + + +@dataclasses.dataclass +class _EvaluationMetricBase: + """Base class for returned evaulation metrics""" + + @property + def input_dataset_paths(self) -> str: + """The Google Cloud Storage paths to the dataset used for this evaluation.""" + pass + + @property + def task_name(self) -> str: + """The type of evaluation task for the evaluation..""" + pass + + +@dataclasses.dataclass +class EvaluationMetric(_EvaluationMetricBase): + """The evaluation metric response. + + Args: + bleu (float): + Optional. BLEU (Bilingual evauation understudy). Scores based on sacrebleu implementation. + rougeLSum (float): + Optional. ROUGE-L (Longest Common Subsequence) scoring at summary level. + """ + + bleu: Optional[float] = None + rougeLSum: Optional[float] = None + + +@dataclasses.dataclass +class EvaluationClassificationMetric(_EvaluationMetricBase): + """The evaluation metric response for classification metrics. + + Args: + label_name (str): + Optional. The name of the label associated with the metrics. This is only + returned when `only_summary_metrics=False` is passed to evaluate(). + auPrc (float): + Optional. The area under the precision recall curve. + auRoc (float): + Optional. The area under the receiver operating characteristic curve. 
+ logLoss (float): + Optional. Logarithmic loss. + confidenceMetrics (List[Dict[str, Any]]): + Optional. This is only returned when `only_summary_metrics=False` is + passed to evaluate(). + confusionMatrix (Dict[str, Any]): + Optional. This is only returned when `only_summary_metrics=False` is + passed to evaluate(). + """ + + label_name: Optional[str] = None + auPrc: Optional[float] = None + auRoc: Optional[float] = None + logLoss: Optional[float] = None + confidenceMetrics: Optional[List[Dict[str, Any]]] = None + confusionMatrix: Optional[Dict[str, Any]] = None + + +@dataclasses.dataclass +class EvaluationSlicedClassificationMetric(_EvaluationMetricBase): + """The evaluation metric slices returned for classification metrics. + + This is returned when `only_summary_metrics=False` is passed to evaluate(). + + Args: + overall_metrics (EvaluationClassificationMetric): + The evaluation metrics across all slices of data + slices (List[EvaluationClassificationMetric]): + The evaluation metrics for each label slice. + """ + + overall_metrics: Optional[EvaluationClassificationMetric] = None + slices: Optional[List[EvaluationClassificationMetric]] = None + + +def _populate_eval_template_params( + task_spec: _EvaluationTaskSpec, + model_name: str, + service_account: Optional[str] = None, + machine_type: Optional[str] = None, + network: Optional[str] = None, + encryption_spec_key_name: Optional[str] = None, +) -> Dict[str, Any]: + """Populates a dictionary of template parameters for the evaluation PipelineJob. + + Args: + task_spec (EvaluationTaskSpec): + The EvaluationTaskSpec passed to evaluate() for this job + model_name (str): + The resource name of the model being evaluated. Either a PublisherModel or + ModelRegistry resource name. + service_account (Optional[str]): + The default service account for workload run-as account. + machine_type (Optional[str]): + Optional. The type of the machine to run the evaluation job on. + network (Optional[str]): + Optional. 
+        encryption_spec_key_name (Optional[str]):
+            Optional.
+
+    Returns:
+        Dict[str, Any]:
+            A dictionary of template parameter names and values to be passed to the PipelineJob
+            running the model evaluation.
+    """
+
+    ground_truth_data_gcs_path = task_spec.ground_truth_data
+
+    staging_bucket = aiplatform_initializer.global_config.staging_bucket
+
+    if not staging_bucket:
+        staging_bucket = (
+            gcs_utils.create_gcs_bucket_for_pipeline_artifacts_if_it_does_not_exist()
+        )
+
+    timestamped_eval_directory = (
+        f"evaluation_data_{aiplatform_utils.timestamped_unique_name()}"
+    )
+
+    if isinstance(task_spec.ground_truth_data, pandas.DataFrame):
+
+        # Convert to jsonl file and upload to gcs
+        dataset_uri = os.path.join(
+            staging_bucket,
+            timestamped_eval_directory,
+            "eval_data.jsonl",
+        )
+
+        gcs_utils._upload_pandas_df_to_gcs(
+            df=task_spec.ground_truth_data, upload_gcs_path=dataset_uri
+        )
+        ground_truth_data_gcs_path = [dataset_uri]
+
+    template_params = {
+        "project": aiplatform_initializer.global_config.project,
+        "location": aiplatform_initializer.global_config.location,
+        "batch_predict_gcs_destination_output_uri": f"{staging_bucket}/{timestamped_eval_directory}",
+        "model_name": model_name,
+        "batch_predict_gcs_source_uris": ground_truth_data_gcs_path,
+        "service_account": service_account,
+        "machine_type": machine_type,
+        "encryption_spec_key_name": encryption_spec_key_name
+        or aiplatform_initializer.global_config.encryption_spec_key_name,
+        "network": network or aiplatform_initializer.global_config.network,
+    }
+
+    if task_spec.task_name == _TEXT_CLASSIFICATION_TASK_NAME:
+        template_params["evaluation_class_labels"] = task_spec.class_names
+        template_params["target_field_name"] = task_spec.target_column_name
+    else:
+        template_params["evaluation_task"] = task_spec.task_name
+
+    return template_params
+
+
+# TODO (b/285947054): update to use public pipeline contract
+def _get_gcs_uri_from_pipeline_task_details(
+    pipeline_job: aiplatform.PipelineJob,
+) -> 
Optional[str]: + """Gets the GCS URI from the PipelineJob output. + + Args: + pipeline_job (aiplatform.PipelineJob) + The PipelineJob resource to get the metrics GCS URI from + + Returns: + The GCS URI of the evaluation metrics as a string. + """ + + for task in pipeline_job.task_details: + if task.task_name == pipeline_job.name and "evaluation_metrics" in task.outputs: + return task.outputs["evaluation_metrics"].artifacts[0].uri + + +def _convert_metrics_dict_to_response_type( + metrics_json: Dict[str, Any], + metric_type: Type[T], + metric_name: Optional[str] = None, +) -> EvaluationClassificationMetric: + metrics_response = metric_type() + if metric_name: + metrics_response.label_name = metric_name + + for metric, value in metrics_json.items(): + if hasattr(metrics_response, metric): + setattr(metrics_response, metric, value) + return metrics_response + + +def _format_classification_metrics( + metrics: Dict[str, Any] +) -> EvaluationSlicedClassificationMetric: + """Reformats classification metrics returned by the eval pipeline to make them more readable. + + Returned metrics are of type EvaluationSlicedClassificationMetric, with `overall` representing + the metrics for all data, and `slices` representing the metrics for each label in the dataset. + + Example schema of reformatted metrics: + + EvaluationSlicedClassificationMetrics( + overall_metrics=EvaluationClassificationMetric( + auPrc=... + ) + slices=[ + EvaluationClassificationMetric( + label_name="overall", + auPrc=..., + ... + ), + EvaluationClassificationMetric( + label_name="label_1", + auPrc=..., + ... + ), + EvaluationClassificationMetric( + label_name="label_2", + auPrc=..., + ... + ) + ] + ) + """ + + reformatted_metrics = EvaluationSlicedClassificationMetric() + + # TODO: see if we can do this without relying on specific keys, i.e. 
slicedMetrics
+
+    # First add overall metrics
+    overall_metrics = _convert_metrics_dict_to_response_type(
+        metrics_json=metrics["slicedMetrics"][0]["metrics"]["classification"],
+        metric_type=EvaluationClassificationMetric,
+    )
+    reformatted_metrics.overall_metrics = overall_metrics
+
+    sliced_metrics = []
+
+    # Then add metrics for each slice
+    for idx in range(1, len(metrics["slicedMetrics"])):
+        metric_slice_name = metrics["slicedMetrics"][idx]["singleOutputSlicingSpec"][
+            "value"
+        ]
+
+        sliced_metric = _convert_metrics_dict_to_response_type(
+            metrics_json=metrics["slicedMetrics"][idx]["metrics"]["classification"],
+            metric_type=EvaluationClassificationMetric,
+            metric_name=metric_slice_name,
+        )
+        sliced_metrics.append(sliced_metric)
+
+    reformatted_metrics.slices = sliced_metrics
+
+    return reformatted_metrics
+
+
+def _get_metrics_from_gcs_uri(
+    gcs_uri: str,
+) -> Union[
+    EvaluationMetric,
+    EvaluationClassificationMetric,
+    EvaluationSlicedClassificationMetric,
+]:
+    """Downloads evaluation metrics from GCS path."""
+
+    storage_client = storage.Client(
+        credentials=aiplatform_initializer.global_config.credentials
+    )
+
+    metrics_json = json.loads(storage.Blob.from_string(
+        uri=gcs_uri, client=storage_client
+    ).download_as_text())
+
+    # Sliced classification metrics case, format data
+    if "slicedMetrics" in metrics_json:
+        return _format_classification_metrics(metrics_json)
+    # If classification metrics don't contain slices, use EvaluationClassificationMetric type
+    if "auPrc" in metrics_json:
+        metrics_response = _convert_metrics_dict_to_response_type(
+            metrics_json=metrics_json,
+            metric_type=EvaluationClassificationMetric,
+        )
+    # All other metric types
+    else:
+        metrics_response = _convert_metrics_dict_to_response_type(
+            metrics_json=metrics_json,
+            metric_type=EvaluationMetric,
+        )
+    return metrics_response
+
+
+def _get_metrics_from_pipeline_task_details(
+    pipeline_job: aiplatform.PipelineJob,
+) -> Union[EvaluationMetric, 
EvaluationClassificationMetric]: + """Gets the evaluation metrics from the PipelineJob TaskDetails. + + Args: + pipeline_job (aiplatform.PipelineJob) + The PipelineJob resource to get the metrics from + + Returns: + A dictionary with the evaluation metrics + """ + metrics = {} + + # TODO (b/292076101): this now uses a public pipelines contract, but still relies on task_details + for task in pipeline_job.task_details: + if task.task_name == pipeline_job.name: + for output in task.outputs: + for metric_name, metric_value in ( + task.outputs[output].artifacts[0].metadata.items() + ): + metrics[metric_name] = metric_value + + if "auPrc" in metrics: + metrics_response = EvaluationClassificationMetric() + else: + metrics_response = EvaluationMetric() + + for metric, value in metrics.items(): + if hasattr(metrics_response, metric): + setattr(metrics_response, metric, value) + return metrics_response + + +class _LanguageModelEvaluationJob: + """Represents a model evaluation job for LLM models. + + These evaluation jobs are run as a Vertex Pipeline. 
+ """ + + def __init__( + self, + pipeline_job: aiplatform.PipelineJob, + ): + self._pipeline_job = pipeline_job + + def result( + self, *, only_summary_metrics: bool + ) -> Union[EvaluationMetric, EvaluationClassificationMetric]: + """Blocks on completion of the model evaluation PipelineJob and returns metrics.""" + + self._pipeline_job.wait() + + if only_summary_metrics: + return _get_metrics_from_pipeline_task_details(self._pipeline_job) + else: + gcs_uri = _get_gcs_uri_from_pipeline_task_details(self._pipeline_job) + if gcs_uri: + return _get_metrics_from_gcs_uri(gcs_uri) + + +class _EvaluatableLanguageModel: + """Mixin class for LLMs that support model evaluation.""" + + # TODO (b/282975912): convert training job specific args to a TrainingConfig + def evaluate( + self, + *, + task_spec: _EvaluationTaskSpec, + only_summary_metrics: Optional[bool] = True, + machine_type: Optional[str] = None, + ) -> Union[ + EvaluationMetric, + EvaluationClassificationMetric, + EvaluationSlicedClassificationMetric, + ]: + """Runs model evaluation using the provided input and ground truth data. + + This creates an evaluation job and blocks until the job completes, about + 10 - 20 minutes. + + Example: + ``` + model = TextGenerationModel.from_pretrained("text-bison@001") + eval_metrics = model.evaluate( + task_spec=EvaluationTextGenerationSpec( + ground_truth_data="gs://my-bucket/ground-truth.jsonl", + ) + ) + ``` + + Args: + task_spec (_EvaluationTaskSpec): + Required. The configuration spec for your model evaluation job. Choose the spec corresponding + with the evaluation task you are performing, one of: EvaluationClassificationSpec, EvaluationTextGenerationSpec, + EvaluationTextSummarizationSpec, EvaluationQuestionAnsweringSpec. 
+ + For example, a valid classification `task_spec` is: + EvaluationTextClassificationSpec( + ground_truth_data=["gs://bucket/path/to/your/data.jsonl"], + class_names=["cheddar", "gouda", "camembert"], + target_column_name="cheese_type", + ) + only_summary_metrics (bool): + Optional. Setting this field to False only affects the metrics returned for text classification tasks. + When False, text classification metrics will include additional sliced metrics fields, with metrics for + each label slice in the data. + machine_type (str): + Optional. The type of the machine to run the evaluation job on. The default value is "e2-highmem-16". For + tasks with a large evaluation dataset, a bigger machine type may be required. + For more details about this input config, see + https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types. + + Returns: + Union[EvaluationMetric, EvaluationClassificationMetric, List[EvaluationClassificationMetric]] + The evaluation metrics from this evaluation job. When `only_summary_metrics=False` is passed + and the evaluation task type is 'text-classification', the return type will be List[EvaluationClassificationMetric], + where each value in the list is the metrics associated with a particular classification label. 
+ """ + + model_info = _model_garden_models._get_model_info( + self._model_id, + schema_to_class_map={self._INSTANCE_SCHEMA_URI: type(self)}, + ) + model_name = _get_model_resource_name_and_validate( + model_name=self._model_resource_name, model_info=model_info + ) + + # TODO(b/296402511): get service_account from aiplatform_initializer and pass it to the template here and to PipelineJob after cl/539823838 is submitted + template_params = _populate_eval_template_params( + task_spec=task_spec, + model_name=model_name, + machine_type=machine_type, + network=aiplatform_initializer.global_config.network, + encryption_spec_key_name=aiplatform_initializer.global_config.encryption_spec_key_name, + ) + + template_path = _get_template_url(task_spec.task_name) + + pipeline_job = aiplatform.PipelineJob( + template_path=template_path, + parameter_values=template_params, + display_name=f"llm-eval-sdk-{aiplatform_utils.timestamped_unique_name()}", + ) + pipeline_job.submit() + + eval_job = _LanguageModelEvaluationJob(pipeline_job=pipeline_job) + + _LOGGER.info( + "Your evaluation job is running and will take 15-20 minutes to complete. Click on the PipelineJob link to view progress." + ) + + # NOTE: only_summary_metrics is passed because getting metrics from the artifact is faster than downloading from GCS + # GCS is only needed for additional metrics for text-classification tasks + return eval_job.result(only_summary_metrics=only_summary_metrics) + + def list_evaluation_metrics( + self, + *, + task_name: Optional[str] = None, + only_summary_metrics: Optional[bool] = True, + ) -> List[Union[EvaluationMetric, EvaluationClassificationMetric]]: + """Lists the evaluation metrics from all evaluation jobs run on this model. + + Args: + task_name (str): + Optional. The task name to return evaluation metrics for. If provided, this will only return evaluation + metrics for tasks of the provided type. 
This matches the possible values of a task spec's `task_name`,
+                and must be one of 'text-generation', 'text-classification', 'summarization', or 'question-answering'.
+
+        Returns:
+            List[Union[EvaluationMetric, EvaluationClassificationMetric]]
+                The evaluation metrics from all evaluation jobs run on this model.
+
+        """
+
+        model_name = self._model_resource_name
+
+        publisher_model_parts = model_garden_service_client.ModelGardenServiceClient.parse_publisher_model_path(
+            "".join(model_name.rpartition("publishers")[1:])
+        )
+
+        if publisher_model_parts:
+            model_id = publisher_model_parts["model"]
+            model_name = f"publishers/google/models/{model_id}"
+
+        filters = f'metadata.component_type.string_value={_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER} AND metadata."input:model_name".string_value={model_name} AND (metadata."input:evaluation_task".string_value={_TEXT_GENERATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_SUMMARIZATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_QA_TASK_NAME} OR metadata."input:evaluation_task".string_value={_TEXT_CLASSIFICATION_TASK_NAME})'
+
+        # NOTE: when task_name is appended to the filter the block of OR filters in `filters` above becomes a no-op
+        if task_name:
+            filters += f' AND metadata."input:evaluation_task".string_value={task_name}'
+
+        filtered_pipeline_executions = aiplatform.Execution.list(
+            filter=filters,
+            project=aiplatform_initializer.global_config.project,
+            location=aiplatform_initializer.global_config.location,
+            credentials=aiplatform_initializer.global_config.credentials,
+        )
+
+        model_eval_metrics = []
+
+        # TODO (b/285950380): improve performance of this method
+        for pipeline_execution in filtered_pipeline_executions:
+            if "pipeline_job_resource_name" not in pipeline_execution.metadata:
+                continue
+
+            pipeline_job_resource = aiplatform.PipelineJob.get(
+                resource_name=pipeline_execution.metadata["pipeline_job_resource_name"]
+            )
+            eval_job_state = pipeline_job_resource._gca_resource.state
+
+            if (
+                eval_job_state
+                
!= gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ): + continue + + metrics = None + + if only_summary_metrics: + metrics = _get_metrics_from_pipeline_task_details(pipeline_job_resource) + else: + gcs_uri = _get_gcs_uri_from_pipeline_task_details(pipeline_job_resource) + if gcs_uri: + metrics = _get_metrics_from_gcs_uri(gcs_uri) + + metrics.input_dataset_paths = pipeline_execution.metadata[ + "input:batch_predict_gcs_source_uris" + ] + metrics.task_name = pipeline_execution.metadata["input:evaluation_task"] + + model_eval_metrics.append(metrics) + + return model_eval_metrics diff --git a/vertexai/language_models/_language_models.py b/vertexai/language_models/_language_models.py index 3ef46cc88e..28d73f4eac 100644 --- a/vertexai/language_models/_language_models.py +++ b/vertexai/language_models/_language_models.py @@ -24,6 +24,9 @@ from google.cloud.aiplatform import utils as aiplatform_utils from google.cloud.aiplatform.utils import gcs_utils from vertexai._model_garden import _model_garden_models +from vertexai.language_models import ( + _evaluatable_language_models, +) try: import pandas @@ -497,7 +500,10 @@ class TextGenerationModel(_TextGenerationModel, _ModelWithBatchPredict): class _PreviewTextGenerationModel( - _TextGenerationModel, _TunableModelMixin, _PreviewModelWithBatchPredict + _TextGenerationModel, + _TunableModelMixin, + _PreviewModelWithBatchPredict, + _evaluatable_language_models._EvaluatableLanguageModel, ): # Do not add docstring so that it's inherited from the base class. 
_LAUNCH_STAGE = _model_garden_models._SDK_PUBLIC_PREVIEW_LAUNCH_STAGE @@ -696,7 +702,9 @@ def start_chat( *, context: Optional[str] = None, examples: Optional[List[InputOutputTextPair]] = None, - max_output_tokens: Optional[int] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, + max_output_tokens: Optional[ + int + ] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -801,7 +809,7 @@ def start_chat( model=self, max_output_tokens=max_output_tokens, temperature=temperature, - message_history=message_history + message_history=message_history, ) @@ -820,7 +828,9 @@ def __init__( model: _ChatModelBase, context: Optional[str] = None, examples: Optional[List[InputOutputTextPair]] = None, - max_output_tokens: Optional[int] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, + max_output_tokens: Optional[ + int + ] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -955,7 +965,9 @@ def __init__( model: ChatModel, context: Optional[str] = None, examples: Optional[List[InputOutputTextPair]] = None, - max_output_tokens: Optional[int] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, + max_output_tokens: Optional[ + int + ] = _TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, diff --git a/vertexai/preview/language_models.py b/vertexai/preview/language_models.py index 6ecf2a6d54..447b3a0f9f 100644 --- a/vertexai/preview/language_models.py +++ b/vertexai/preview/language_models.py @@ -29,6 +29,16 @@ TextGenerationResponse, ) +from vertexai.language_models._evaluatable_language_models import ( + EvaluationTextGenerationSpec, + EvaluationTextSummarizationSpec, + EvaluationQuestionAnsweringSpec, + EvaluationTextClassificationSpec, + EvaluationClassificationMetric, + EvaluationMetric, +) + + 
ChatModel = _PreviewChatModel CodeChatModel = _PreviewCodeChatModel CodeGenerationModel = _PreviewCodeGenerationModel @@ -42,6 +52,12 @@ "CodeChatModel", "CodeChatSession", "CodeGenerationModel", + "EvaluationClassificationMetric", + "EvaluationMetric", + "EvaluationTextGenerationSpec", + "EvaluationTextSummarizationSpec", + "EvaluationQuestionAnsweringSpec", + "EvaluationTextClassificationSpec", "InputOutputTextPair", "TextEmbedding", "TextEmbeddingModel",