Experiments complex metrics (#8)
* feat: new class and API for metrics

* update system test

* update high level log method

* fix system test

* update example

* change from system schema to google schema
jaycee-li authored Sep 26, 2022
1 parent 2a906c8 commit 3ca2d28
Showing 9 changed files with 419 additions and 3 deletions.
4 changes: 4 additions & 0 deletions google/cloud/aiplatform/__init__.py
@@ -86,6 +86,9 @@

log_params = metadata.metadata._experiment_tracker.log_params
log_metrics = metadata.metadata._experiment_tracker.log_metrics
log_classification_metrics = (
metadata.metadata._experiment_tracker.log_classification_metrics
)
get_experiment_df = metadata.metadata._experiment_tracker.get_experiment_df
start_run = metadata.metadata._experiment_tracker.start_run
start_execution = metadata.metadata._experiment_tracker.start_execution
@@ -110,6 +113,7 @@
"log",
"log_params",
"log_metrics",
"log_classification_metrics",
"log_time_series_metrics",
"get_experiment_df",
"get_pipeline_df",
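A quick sanity-check sketch (not part of the commit) of what this hunk wires up: the experiment tracker's new method is re-exported as a module-level function and added to the package's public symbol list.

```python
from google.cloud import aiplatform

# After this change, the new API is a module-level callable and part of __all__.
assert callable(aiplatform.log_classification_metrics)
assert "log_classification_metrics" in aiplatform.__all__
```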
156 changes: 156 additions & 0 deletions google/cloud/aiplatform/metadata/experiment_run_resource.py
@@ -39,6 +39,7 @@
from google.cloud.aiplatform.metadata import metadata
from google.cloud.aiplatform.metadata import resource
from google.cloud.aiplatform.metadata import utils as metadata_utils
from google.cloud.aiplatform.metadata import schema
from google.cloud.aiplatform.tensorboard import tensorboard_resource
from google.cloud.aiplatform.utils import rest_utils

@@ -990,6 +991,103 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]):
# TODO: query the latest metrics artifact resource before logging.
self._metadata_node.update(metadata={constants._METRIC_KEY: metrics})

def log_classification_metrics(
self,
*,
labels: Optional[List[str]] = None,
matrix: Optional[List[List[int]]] = None,
fpr: Optional[List[float]] = None,
tpr: Optional[List[float]] = None,
threshold: Optional[List[float]] = None,
display_name: Optional[str] = None,
):
"""Create an artifact for classification metrics and log to ExperimentRun. Currently support confusion matrix and ROC curve.
```
my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment')
my_run.log_classification_metrics(
display_name='my-classification-metrics',
labels=['cat', 'dog'],
matrix=[[9, 1], [1, 9]],
fpr=[0.1, 0.5, 0.9],
tpr=[0.1, 0.7, 0.9],
threshold=[0.9, 0.5, 0.1],
)
```
Args:
labels (List[str]):
Optional. List of label names for the confusion matrix. Must be set if 'matrix' is set.
matrix (List[List[int]]):
Optional. Values for the confusion matrix. Must be set if 'labels' is set.
fpr (List[float]):
Optional. List of false positive rates for the ROC curve. Must be set if 'tpr' or 'threshold' is set.
tpr (List[float]):
Optional. List of true positive rates for the ROC curve. Must be set if 'fpr' or 'threshold' is set.
threshold (List[float]):
Optional. List of thresholds for the ROC curve. Must be set if 'fpr' or 'tpr' is set.
display_name (str):
Optional. The user-defined name for the classification metric artifact.
Raises:
ValueError: if 'labels' and 'matrix' are not set together,
or if 'labels' and 'matrix' are not of the same length,
or if 'fpr', 'tpr', and 'threshold' are not set together,
or if 'fpr', 'tpr', and 'threshold' are not of the same length.
"""
if (labels or matrix) and not (labels and matrix):
raise ValueError("labels and matrix must be set together.")

if (fpr or tpr or threshold) and not (fpr and tpr and threshold):
raise ValueError("fpr, tpr, and thresholds must be set together.")

metadata = {}
if labels and matrix:
if len(matrix) != len(labels):
raise ValueError(
"Length of labels and matrix must be the same. "
"Got lengths {} and {} respectively.".format(
len(labels), len(matrix)
)
)

confusion_matrix = {
"annotationSpecs": [{"displayName": label} for label in labels],
"rows": matrix,
}
metadata["confusionMatrix"] = confusion_matrix

if fpr and tpr and threshold:
if (
len(fpr) != len(tpr)
or len(fpr) != len(threshold)
or len(tpr) != len(threshold)
):
raise ValueError(
"Length of fpr, tpr and threshold must be the same. "
"Got lengths {}, {} and {} respectively.".format(
len(fpr), len(tpr), len(threshold)
)
)

metadata["confidenceMetrics"] = [
{
"confidenceThreshold": threshold[i],
"recall": tpr[i],
"falsePositiveRate": fpr[i],
}
for i in range(len(fpr))
]

classification_metrics = schema.google.artifact_schema.ClassificationMetrics(
display_name=display_name,
metadata=metadata,
)
classification_metrics_artifact = classification_metrics.create()
self._metadata_node.add_artifacts_and_executions(
artifact_resource_names=[classification_metrics_artifact.resource_name]
)

@_v1_not_supported
def get_time_series_data_frame(self) -> "pd.DataFrame": # noqa: F821
"""Returns all time series in this Run as a DataFrame.
@@ -1149,6 +1247,64 @@ def get_metrics(self) -> Dict[str, Union[float, int, str]]:
else:
return self._metadata_node.metadata[constants._METRIC_KEY]

def get_classification_metrics(self) -> List[Dict[str, Union[str, List]]]:
"""Get all the classification metrics logged to this run.
```
my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment')
metric = my_run.get_classification_metrics()[0]
print(metric)
## print result:
{
"id": "e6c893a4-222e-4c60-a028-6a3b95dfc109",
"display_name": "my-classification-metrics",
"labels": ["cat", "dog"],
"matrix": [[9,1], [1,9]],
"fpr": [0.1, 0.5, 0.9],
"tpr": [0.1, 0.7, 0.9],
"thresholds": [0.9, 0.5, 0.1]
}
```
Returns:
List of classification metrics logged to this experiment run.
"""

artifact_list = artifact.Artifact.list(
filter=metadata_utils._make_filter_string(
in_context=[self.resource_name],
schema_title="google.ClassificationMetrics",
),
project=self.project,
location=self.location,
credentials=self.credentials,
)

metrics = []
for metric_artifact in artifact_list:
metric = {}
metric["id"] = metric_artifact.name
metric["display_name"] = metric_artifact.display_name
metadata = metric_artifact.metadata
if "confusionMatrix" in metadata:
metric["labels"] = [
d["displayName"]
for d in metadata["confusionMatrix"]["annotationSpecs"]
]
metric["matrix"] = metadata["confusionMatrix"]["rows"]

if "confidenceMetrics" in metadata:
metric["fpr"] = [
d["falsePositiveRate"] for d in metadata["confidenceMetrics"]
]
metric["tpr"] = [d["recall"] for d in metadata["confidenceMetrics"]]
metric["threshold"] = [
d["confidenceThreshold"] for d in metadata["confidenceMetrics"]
]
metrics.append(metric)

return metrics

@_v1_not_supported
def associate_execution(self, execution: execution.Execution):
"""Associate an execution to this experiment run.
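For orientation, a sketch of the artifact metadata that `log_classification_metrics` should assemble for its docstring example inputs; the dict below is inferred from the `confusionMatrix`/`confidenceMetrics` construction above, not copied from the commit. `get_classification_metrics` then flattens this back into the `labels`/`matrix`/`fpr`/`tpr`/`threshold` dict shown in its own docstring.

```python
# Inferred payload for labels=['cat', 'dog'], matrix=[[9, 1], [1, 9]],
# fpr=[0.1, 0.5, 0.9], tpr=[0.1, 0.7, 0.9], threshold=[0.9, 0.5, 0.1].
expected_metadata = {
    "confusionMatrix": {
        "annotationSpecs": [{"displayName": "cat"}, {"displayName": "dog"}],
        "rows": [[9, 1], [1, 9]],
    },
    "confidenceMetrics": [
        {"confidenceThreshold": 0.9, "recall": 0.1, "falsePositiveRate": 0.1},
        {"confidenceThreshold": 0.5, "recall": 0.7, "falsePositiveRate": 0.5},
        {"confidenceThreshold": 0.1, "recall": 0.9, "falsePositiveRate": 0.9},
    ],
}
```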
59 changes: 57 additions & 2 deletions google/cloud/aiplatform/metadata/metadata.py
@@ -15,8 +15,7 @@
# limitations under the License.
#


from typing import Dict, Union, Optional, Any
from typing import Dict, Union, Optional, Any, List

from google.api_core import exceptions
from google.auth import credentials as auth_credentials
@@ -371,6 +370,62 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]):
# query the latest metrics artifact resource before logging.
self._experiment_run.log_metrics(metrics=metrics)

def log_classification_metrics(
self,
*,
labels: Optional[List[str]] = None,
matrix: Optional[List[List[int]]] = None,
fpr: Optional[List[float]] = None,
tpr: Optional[List[float]] = None,
threshold: Optional[List[float]] = None,
display_name: Optional[str] = None,
):
"""Create an artifact for classification metrics and log to ExperimentRun. Currently support confusion matrix and ROC curve.
```
aiplatform.log_classification_metrics(
display_name='my-classification-metrics',
labels=['cat', 'dog'],
matrix=[[9, 1], [1, 9]],
fpr=[0.1, 0.5, 0.9],
tpr=[0.1, 0.7, 0.9],
threshold=[0.9, 0.5, 0.1],
)
```
Args:
labels (List[str]):
Optional. List of label names for the confusion matrix. Must be set if 'matrix' is set.
matrix (List[List[int]]):
Optional. Values for the confusion matrix. Must be set if 'labels' is set.
fpr (List[float]):
Optional. List of false positive rates for the ROC curve. Must be set if 'tpr' or 'threshold' is set.
tpr (List[float]):
Optional. List of true positive rates for the ROC curve. Must be set if 'fpr' or 'threshold' is set.
threshold (List[float]):
Optional. List of thresholds for the ROC curve. Must be set if 'fpr' or 'tpr' is set.
display_name (str):
Optional. The user-defined name for the classification metric artifact.
Raises:
ValueError: if 'labels' and 'matrix' are not set together,
or if 'labels' and 'matrix' are not of the same length,
or if 'fpr', 'tpr', and 'threshold' are not set together,
or if 'fpr', 'tpr', and 'threshold' are not of the same length.
"""

self._validate_experiment_and_run(method_name="log_classification_metrics")
# query the latest metrics artifact resource before logging.
self._experiment_run.log_classification_metrics(
display_name=display_name,
labels=labels,
matrix=matrix,
fpr=fpr,
tpr=tpr,
threshold=threshold,
)

def _validate_experiment_and_run(self, method_name: str):
"""Validates Experiment and Run are set and raises informative error message.
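As a usage illustration for the module-level API wired up in this file, here is a hedged end-to-end sketch that derives the inputs with scikit-learn and logs them through `aiplatform.log_classification_metrics`. scikit-learn, the project/experiment/run names, and the toy data are assumptions for illustration, not part of the commit.

```python
from google.cloud import aiplatform
from sklearn.metrics import confusion_matrix, roc_curve

# Hypothetical ground truth, hard predictions, and scores for the positive class ("dog").
y_true = [0, 0, 0, 1, 1, 1]   # 0 = "cat", 1 = "dog"
y_pred = [0, 0, 1, 1, 1, 0]
y_score = [0.2, 0.4, 0.6, 0.7, 0.8, 0.3]

aiplatform.init(experiment="my-experiment", project="my-project", location="us-central1")
aiplatform.start_run(run="my-run")

# Drop the first ROC point, whose threshold is an artificial sentinel (inf in recent scikit-learn).
fpr, tpr, threshold = (arr[1:].tolist() for arr in roc_curve(y_true, y_score))
aiplatform.log_classification_metrics(
    display_name="my-classification-metrics",
    labels=["cat", "dog"],
    matrix=confusion_matrix(y_true, y_pred).tolist(),
    fpr=fpr,
    tpr=tpr,
    threshold=threshold,
)
aiplatform.end_run()

# Read the logged artifact back through the run-level API added in experiment_run_resource.py.
my_run = aiplatform.ExperimentRun("my-run", experiment="my-experiment")
print(my_run.get_classification_metrics())
```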
6 changes: 6 additions & 0 deletions samples/model-builder/conftest.py
@@ -824,6 +824,12 @@ def mock_log_params():
mock_log_params.return_value = None
yield mock_log_params

@pytest.fixture
def mock_log_classification_metrics():
with patch.object(aiplatform, "log_classification_metrics") as mock_log_classification_metrics:
mock_log_classification_metrics.return_value = None
yield mock_log_classification_metrics


@pytest.fixture
def mock_log_pipeline_job():
@@ -0,0 +1,47 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional

from google.cloud import aiplatform


# [START aiplatform_sdk_log_classification_metrics_sample]
def log_classification_metrics_sample(
experiment_name: str,
run_name: str,
project: str,
location: str,
labels: Optional[List[str]] = None,
matrix: Optional[List[List[int]]] = None,
fpr: Optional[List[float]] = None,
tpr: Optional[List[float]] = None,
threshold: Optional[List[float]] = None,
display_name: Optional[str] = None,
):
aiplatform.init(experiment=experiment_name, project=project, location=location)

aiplatform.start_run(run=run_name, resume=True)

aiplatform.log_classification_metrics(
labels=labels,
matrix=matrix,
fpr=fpr,
tpr=tpr,
threshold=threshold,
display_name=display_name,
)


# [END aiplatform_sdk_log_classification_metrics_sample]
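A possible invocation of the sample above; the project, location, experiment, and run values are placeholders, and the metric values mirror the docstring examples elsewhere in this commit.

```python
log_classification_metrics_sample(
    experiment_name="my-experiment",   # placeholder
    run_name="my-run",                 # placeholder
    project="my-project",              # placeholder
    location="us-central1",            # placeholder
    labels=["cat", "dog"],
    matrix=[[9, 1], [1, 9]],
    fpr=[0.1, 0.5, 0.9],
    tpr=[0.1, 0.7, 0.9],
    threshold=[0.9, 0.5, 0.1],
    display_name="my-classification-metrics",
)
```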
@@ -0,0 +1,38 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import log_classification_metrics_sample

import pytest

import test_constants as constants


@pytest.mark.usefixtures("mock_sdk_init", "mock_start_run")
def test_log_classification_metrics_sample(mock_log_classification_metrics):

log_classification_metrics_sample.log_classification_metrics_sample(
experiment_name=constants.EXPERIMENT_NAME,
run_name=constants.EXPERIMENT_RUN_NAME,
project=constants.PROJECT,
location=constants.LOCATION,
labels=constants.CLASSIFICATION_METRICS["labels"],
matrix=constants.CLASSIFICATION_METRICS["matrix"],
fpr=constants.CLASSIFICATION_METRICS["fpr"],
tpr=constants.CLASSIFICATION_METRICS["tpr"],
threshold=constants.CLASSIFICATION_METRICS["threshold"],
display_name=constants.CLASSIFICATION_METRICS["display_name"],
)

mock_log_classification_metrics.assert_called_with(**constants.CLASSIFICATION_METRICS)
9 changes: 8 additions & 1 deletion samples/model-builder/test_constants.py
@@ -272,7 +272,14 @@

METRICS = {"accuracy": 0.1}
PARAMS = {"learning_rate": 0.1}

CLASSIFICATION_METRICS = {
"display_name": "my-classification-metrics",
"labels": ["cat", "dog"],
"matrix": [[9, 1], [1, 9]],
"fpr": [0.1, 0.5, 0.9],
"tpr": [0.1, 0.7, 0.9],
"threshold": [0.9, 0.5, 0.1],
}
TEMPLATE_PATH = "pipeline.json"

STEP = 1