Skip to content

Commit

Permalink
feat: add TensorBoard log uploader
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 521565504
  • Loading branch information
vertex-sdk-bot authored and copybara-github committed Apr 3, 2023
1 parent 00b853b commit 3fad7bb
Show file tree
Hide file tree
Showing 7 changed files with 526 additions and 103 deletions.
5 changes: 5 additions & 0 deletions google/cloud/aiplatform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
MatchingEngineIndexEndpoint,
)
from google.cloud.aiplatform import metadata
from google.cloud.aiplatform.tensorboard import uploader_tracker
from google.cloud.aiplatform.models import Endpoint
from google.cloud.aiplatform.models import PrivateEndpoint
from google.cloud.aiplatform.models import Model
Expand Down Expand Up @@ -100,6 +101,10 @@
log_time_series_metrics = metadata.metadata._experiment_tracker.log_time_series_metrics
end_run = metadata.metadata._experiment_tracker.end_run

upload_tb_log = uploader_tracker._tensorboard_tracker.upload_tb_log
start_upload_tb_log = uploader_tracker._tensorboard_tracker.start_upload_tb_log
end_upload_tb_log = uploader_tracker._tensorboard_tracker.end_upload_tb_log

save_model = metadata._models.save_model
get_experiment_model = metadata.schema.google.artifact_schema.ExperimentModel.get

Expand Down
63 changes: 34 additions & 29 deletions google/cloud/aiplatform/tensorboard/uploader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2021 Google LLC
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -15,33 +15,46 @@
# limitations under the License.
#
"""Uploads a TensorBoard logdir to TensorBoard.gcp."""

import abc
from collections import defaultdict
import functools
import logging
import os
import time
import re
from typing import (
Dict,
FrozenSet,
Generator,
Iterable,
Optional,
ContextManager,
Tuple,
)
import time
from typing import ContextManager, Dict, FrozenSet, Generator, Iterable, Optional, Tuple
import uuid

from google.api_core import exceptions
from google.cloud import storage
from google.cloud.aiplatform import base
from google.cloud.aiplatform.compat.services import (
tensorboard_service_client,
)
from google.cloud.aiplatform.compat.types import tensorboard_data
from google.cloud.aiplatform.compat.types import tensorboard_experiment
from google.cloud.aiplatform.compat.types import tensorboard_service
from google.cloud.aiplatform.compat.types import tensorboard_time_series
from google.cloud.aiplatform.tensorboard import uploader_utils
from google.cloud.aiplatform.tensorboard.plugins.tf_profiler import (
profile_uploader,
)
import grpc
import tensorflow as tf

from google.protobuf import timestamp_pb2 as timestamp
from google.protobuf import message
from tensorboard.backend import process_graph
from tensorboard.backend.event_processing.plugin_event_accumulator import (
directory_loader,
)
from tensorboard.backend.event_processing.plugin_event_accumulator import (
event_file_loader,
)
from tensorboard.backend.event_processing.plugin_event_accumulator import io_wrapper
from tensorboard.backend.event_processing.plugin_event_accumulator import (
io_wrapper,
)
from tensorboard.compat.proto import graph_pb2
from tensorboard.compat.proto import summary_pb2
from tensorboard.compat.proto import types_pb2
Expand All @@ -52,19 +65,8 @@
from tensorboard.uploader.proto import server_info_pb2
from tensorboard.util import tb_logging
from tensorboard.util import tensor_util
import tensorflow as tf

from google.api_core import exceptions
from google.cloud import storage
from google.cloud.aiplatform.compat.services import tensorboard_service_client
from google.cloud.aiplatform.compat.types import tensorboard_data
from google.cloud.aiplatform.compat.types import tensorboard_experiment
from google.cloud.aiplatform.compat.types import tensorboard_service
from google.cloud.aiplatform.compat.types import tensorboard_time_series
from google.cloud.aiplatform.tensorboard import uploader_utils
from google.cloud.aiplatform.tensorboard.plugins.tf_profiler import profile_uploader
from google.protobuf import message
from google.protobuf import timestamp_pb2 as timestamp
_LOGGER = base.Logger(__name__)

TensorboardServiceClient = tensorboard_service_client.TensorboardServiceClient

Expand Down Expand Up @@ -189,6 +191,7 @@ def __init__(
self._allowed_plugins = frozenset(allowed_plugins)
self._run_name_prefix = run_name_prefix
self._is_brand_new_experiment = False
self._continue_uploading = True

self._upload_limits = upload_limits
if not self._upload_limits:
Expand Down Expand Up @@ -388,20 +391,22 @@ def start_uploading(self):
"performance."
)

while True:
while self._continue_uploading:
self._logdir_poll_rate_limiter.tick()
self._upload_once()
if self._one_shot:
break
if self._one_shot and not self._tracker.has_data():
logger.warning(
"One-shot mode was used on a logdir (%s) "
"without any uploadable data" % self._logdir
"One-shot mode was used on a logdir (%s) without any uploadable data"
% self._logdir
)

def _end_uploading(self):
self._continue_uploading = False

def _pre_create_runs_and_time_series(self):
"""
Iterates though the log dir to collect TensorboardRuns and
"""Iterates through the log dir to collect TensorboardRuns and
TensorboardTimeSeries that need to be created, and creates them in batch
to speed up uploading later on.
"""
Expand Down
23 changes: 23 additions & 0 deletions google/cloud/aiplatform/tensorboard/uploader_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Constants shared between the TensorBoard command-line uploader and the SDK uploader."""

from tensorboard.plugins.distribution import (
metadata as distribution_metadata,
)
from tensorboard.plugins.graph import metadata as graphs_metadata
from tensorboard.plugins.histogram import (
metadata as histogram_metadata,
)
from tensorboard.plugins.hparams import metadata as hparams_metadata
from tensorboard.plugins.image import metadata as images_metadata
from tensorboard.plugins.scalar import metadata as scalar_metadata
from tensorboard.plugins.text import metadata as text_metadata

# Plugin names accepted by the uploader by default; the command-line uploader
# passes this list as the default value of its "allowed_plugins" flag.
ALLOWED_PLUGINS = [
    plugin_metadata.PLUGIN_NAME
    for plugin_metadata in (
        scalar_metadata,
        histogram_metadata,
        distribution_metadata,
        text_metadata,
        hparams_metadata,
        images_metadata,
        graphs_metadata,
    )
]
56 changes: 12 additions & 44 deletions google/cloud/aiplatform/tensorboard/uploader_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,16 @@

from absl import app
from absl import flags
import grpc
from tensorboard.plugins.scalar import metadata as scalar_metadata
from tensorboard.plugins.distribution import metadata as distribution_metadata
from tensorboard.plugins.histogram import metadata as histogram_metadata
from tensorboard.plugins.text import metadata as text_metadata
from tensorboard.plugins.hparams import metadata as hparams_metadata
from tensorboard.plugins.image import metadata as images_metadata
from tensorboard.plugins.graph import metadata as graphs_metadata

from google.api_core import exceptions
from google.cloud import storage
from google.cloud import aiplatform
from google.cloud.aiplatform.constants import base as constants
from google.cloud.aiplatform import jobs
from google.cloud.aiplatform.constants import base as constants
from google.cloud.aiplatform.tensorboard import uploader
from google.cloud.aiplatform.tensorboard import uploader_constants
from google.cloud.aiplatform.tensorboard import uploader_utils
from google.cloud.aiplatform.utils import TensorboardClientWithOverride


FLAGS = flags.FLAGS
flags.DEFINE_string("experiment_name", None, "The name of the Cloud AI Experiment.")
flags.DEFINE_string(
Expand Down Expand Up @@ -73,15 +66,7 @@

flags.DEFINE_multi_string(
"allowed_plugins",
[
scalar_metadata.PLUGIN_NAME,
histogram_metadata.PLUGIN_NAME,
distribution_metadata.PLUGIN_NAME,
text_metadata.PLUGIN_NAME,
hparams_metadata.PLUGIN_NAME,
images_metadata.PLUGIN_NAME,
graphs_metadata.PLUGIN_NAME,
],
uploader_constants.ALLOWED_PLUGINS,
"Plugins allowed by the Uploader.",
)

Expand All @@ -103,29 +88,12 @@ def main(argv):
location_override=region,
)

try:
tensorboard = api_client.get_tensorboard(name=FLAGS.tensorboard_resource_name)
except grpc.RpcError as rpc_error:
if rpc_error.code() == grpc.StatusCode.NOT_FOUND:
raise app.UsageError(
"Tensorboard resource %s not found" % FLAGS.tensorboard_resource_name,
exitcode=0,
) from rpc_error
raise

if tensorboard.blob_storage_path_prefix:
path_prefix = tensorboard.blob_storage_path_prefix + "/"
first_slash_index = path_prefix.find("/")
bucket_name = path_prefix[:first_slash_index]
blob_storage_bucket = storage.Client(project=project_id).bucket(bucket_name)
blob_storage_folder = path_prefix[first_slash_index + 1 :]
else:
raise app.UsageError(
"Tensorboard resource {} is obsolete. Please create a new one.".format(
FLAGS.tensorboard_resource_name
),
exitcode=0,
)
(
blob_storage_bucket,
blob_storage_folder,
) = uploader_utils.get_blob_storage_bucket_and_folder(
api_client, FLAGS.tensorboard_resource_name, project_id
)

experiment_name = FLAGS.experiment_name
experiment_display_name = get_experiment_display_name_with_override(
Expand All @@ -135,7 +103,7 @@ def main(argv):
tb_uploader = uploader.TensorBoardUploader(
experiment_name=experiment_name,
experiment_display_name=experiment_display_name,
tensorboard_resource_name=tensorboard.name,
tensorboard_resource_name=FLAGS.tensorboard_resource_name,
blob_storage_bucket=blob_storage_bucket,
blob_storage_folder=blob_storage_folder,
allowed_plugins=FLAGS.allowed_plugins,
Expand Down
Loading

0 comments on commit 3fad7bb

Please sign in to comment.