From 4cc457faf4873c0ce674b6b5f857b5ee85967bf6 Mon Sep 17 00:00:00 2001
From: Lauren Yu <6631887+laurenyu@users.noreply.github.com>
Date: Tue, 17 Jul 2018 07:43:48 -0700
Subject: [PATCH] Move batch transform notebooks (#328)

A new top-level directory requires a separate change to show up in Amazon
SageMaker, so moving these back under existing top-level directories
---
 .../batch_transform}/Dockerfile                |   0
 ..._transform_pca_dbscan_movie_clusters.ipynb  |   0
 .../batch_transform}/dbscan.R                  |   0
 .../batch_transform}/plumber.R                 |   0
 batch_transform/mxnet_mnist/mnist.py           |  73 -----------
 .../tensorflow_distributed_mnist/mnist.py      | 121 ------------------
 .../tensorflow_distributed_mnist/utils.py      |  39 ------
 .../mxnet_mnist_with_batch_transform.ipynb     |   0
 .../tensorflow_transformer_mnist.ipynb         |   0
 9 files changed, 233 deletions(-)
 rename {batch_transform/pca_dbscan => advanced_functionality/batch_transform}/Dockerfile (100%)
 rename {batch_transform/pca_dbscan => advanced_functionality/batch_transform}/batch_transform_pca_dbscan_movie_clusters.ipynb (100%)
 rename {batch_transform/pca_dbscan => advanced_functionality/batch_transform}/dbscan.R (100%)
 rename {batch_transform/pca_dbscan => advanced_functionality/batch_transform}/plumber.R (100%)
 delete mode 100644 batch_transform/mxnet_mnist/mnist.py
 delete mode 100644 batch_transform/tensorflow_distributed_mnist/mnist.py
 delete mode 100644 batch_transform/tensorflow_distributed_mnist/utils.py
 rename {batch_transform => sagemaker-python-sdk}/mxnet_mnist/mxnet_mnist_with_batch_transform.ipynb (100%)
 rename {batch_transform => sagemaker-python-sdk}/tensorflow_distributed_mnist/tensorflow_transformer_mnist.ipynb (100%)

diff --git a/batch_transform/pca_dbscan/Dockerfile b/advanced_functionality/batch_transform/Dockerfile
similarity index 100%
rename from batch_transform/pca_dbscan/Dockerfile
rename to advanced_functionality/batch_transform/Dockerfile
diff --git a/batch_transform/pca_dbscan/batch_transform_pca_dbscan_movie_clusters.ipynb b/advanced_functionality/batch_transform/batch_transform_pca_dbscan_movie_clusters.ipynb
similarity index 100%
rename from batch_transform/pca_dbscan/batch_transform_pca_dbscan_movie_clusters.ipynb
rename to advanced_functionality/batch_transform/batch_transform_pca_dbscan_movie_clusters.ipynb
diff --git a/batch_transform/pca_dbscan/dbscan.R b/advanced_functionality/batch_transform/dbscan.R
similarity index 100%
rename from batch_transform/pca_dbscan/dbscan.R
rename to advanced_functionality/batch_transform/dbscan.R
diff --git a/batch_transform/pca_dbscan/plumber.R b/advanced_functionality/batch_transform/plumber.R
similarity index 100%
rename from batch_transform/pca_dbscan/plumber.R
rename to advanced_functionality/batch_transform/plumber.R
diff --git a/batch_transform/mxnet_mnist/mnist.py b/batch_transform/mxnet_mnist/mnist.py
deleted file mode 100644
index b237816be6..0000000000
--- a/batch_transform/mxnet_mnist/mnist.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import logging
-
-import gzip
-import mxnet as mx
-import numpy as np
-import os
-import struct
-
-
-def load_data(path):
-    with gzip.open(find_file(path, "labels.gz")) as flbl:
-        struct.unpack(">II", flbl.read(8))
-        labels = np.fromstring(flbl.read(), dtype=np.int8)
-    with gzip.open(find_file(path, "images.gz")) as fimg:
-        _, _, rows, cols = struct.unpack(">IIII", fimg.read(16))
-        images = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(labels), rows, cols)
-        images = images.reshape(images.shape[0], 1, 28, 28).astype(np.float32) / 255
-    return labels, images
-
-
-def find_file(root_path, file_name):
-    for root, dirs, files in os.walk(root_path):
-        if file_name in files:
-            return os.path.join(root, file_name)
-
-
-def build_graph():
-    data = mx.sym.var('data')
-    data = mx.sym.flatten(data=data)
-    fc1 = mx.sym.FullyConnected(data=data, num_hidden=128)
-    act1 = mx.sym.Activation(data=fc1, act_type="relu")
-    fc2 = mx.sym.FullyConnected(data=act1, num_hidden=64)
-    act2 = mx.sym.Activation(data=fc2, act_type="relu")
-    fc3 = mx.sym.FullyConnected(data=act2, num_hidden=10)
-    return mx.sym.SoftmaxOutput(data=fc3, name='softmax')
-
-
-def train(current_host, channel_input_dirs, hyperparameters, hosts, num_cpus, num_gpus):
-    (train_labels, train_images) = load_data(os.path.join(channel_input_dirs['train']))
-    (test_labels, test_images) = load_data(os.path.join(channel_input_dirs['test']))
-
-    # Alternatively to splitting in memory, the data could be pre-split in S3 and use ShardedByS3Key
-    # to do parallel training.
-    shard_size = len(train_images) // len(hosts)
-    for i, host in enumerate(hosts):
-        if host == current_host:
-            start = shard_size * i
-            end = start + shard_size
-            break
-
-    batch_size = 100
-    train_iter = mx.io.NDArrayIter(train_images[start:end], train_labels[start:end], batch_size, shuffle=True)
-    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)
-    logging.getLogger().setLevel(logging.DEBUG)
-    kvstore = 'local' if len(hosts) == 1 else 'dist_sync'
-    mlp_model = mx.mod.Module(
-        symbol=build_graph(),
-        context=get_train_context(num_cpus, num_gpus))
-    mlp_model.fit(train_iter,
-                  eval_data=val_iter,
-                  kvstore=kvstore,
-                  optimizer='sgd',
-                  optimizer_params={'learning_rate': float(hyperparameters.get("learning_rate", 0.1))},
-                  eval_metric='acc',
-                  batch_end_callback=mx.callback.Speedometer(batch_size, 100),
-                  num_epoch=25)
-    return mlp_model
-
-
-def get_train_context(num_cpus, num_gpus):
-    if num_gpus > 0:
-        return mx.gpu()
-    return mx.cpu()
diff --git a/batch_transform/tensorflow_distributed_mnist/mnist.py b/batch_transform/tensorflow_distributed_mnist/mnist.py
deleted file mode 100644
index b3b3461640..0000000000
--- a/batch_transform/tensorflow_distributed_mnist/mnist.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import os
-import tensorflow as tf
-from tensorflow.python.estimator.model_fn import ModeKeys as Modes
-
-INPUT_TENSOR_NAME = 'inputs'
-SIGNATURE_NAME = 'predictions'
-
-LEARNING_RATE = 0.001
-
-
-def model_fn(features, labels, mode, params):
-    # Input Layer
-    input_layer = tf.reshape(features[INPUT_TENSOR_NAME], [-1, 28, 28, 1])
-
-    # Convolutional Layer #1
-    conv1 = tf.layers.conv2d(
-        inputs=input_layer,
-        filters=32,
-        kernel_size=[5, 5],
-        padding='same',
-        activation=tf.nn.relu)
-
-    # Pooling Layer #1
-    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
-
-    # Convolutional Layer #2 and Pooling Layer #2
-    conv2 = tf.layers.conv2d(
-        inputs=pool1,
-        filters=64,
-        kernel_size=[5, 5],
-        padding='same',
-        activation=tf.nn.relu)
-    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
-
-    # Dense Layer
-    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
-    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
-    dropout = tf.layers.dropout(
-        inputs=dense, rate=0.4, training=(mode == Modes.TRAIN))
-
-    # Logits Layer
-    logits = tf.layers.dense(inputs=dropout, units=10)
-
-    # Define operations
-    if mode in (Modes.PREDICT, Modes.EVAL):
-        predicted_indices = tf.argmax(input=logits, axis=1)
-        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
-
-    if mode in (Modes.TRAIN, Modes.EVAL):
-        global_step = tf.train.get_or_create_global_step()
-        label_indices = tf.cast(labels, tf.int32)
-        loss = tf.losses.softmax_cross_entropy(
-            onehot_labels=tf.one_hot(label_indices, depth=10), logits=logits)
-        tf.summary.scalar('OptimizeLoss', loss)
-
-    if mode == Modes.PREDICT:
-        predictions = {
-            'classes': predicted_indices,
-            'probabilities': probabilities
-        }
-        export_outputs = {
-            SIGNATURE_NAME: tf.estimator.export.PredictOutput(predictions)
-        }
-        return tf.estimator.EstimatorSpec(
-            mode, predictions=predictions, export_outputs=export_outputs)
-
-    if mode == Modes.TRAIN:
-        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
-        train_op = optimizer.minimize(loss, global_step=global_step)
-        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-    if mode == Modes.EVAL:
-        eval_metric_ops = {
-            'accuracy': tf.metrics.accuracy(label_indices, predicted_indices)
-        }
-        return tf.estimator.EstimatorSpec(
-            mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def serving_input_fn(params):
-    inputs = {INPUT_TENSOR_NAME: tf.placeholder(tf.float32, [None, 784])}
-    return tf.estimator.export.ServingInputReceiver(inputs, inputs)
-
-
-def read_and_decode(filename_queue):
-    reader = tf.TFRecordReader()
-    _, serialized_example = reader.read(filename_queue)
-
-    features = tf.parse_single_example(
-        serialized_example,
-        features={
-            'image_raw': tf.FixedLenFeature([], tf.string),
-            'label': tf.FixedLenFeature([], tf.int64),
-        })
-
-    image = tf.decode_raw(features['image_raw'], tf.uint8)
-    image.set_shape([784])
-    image = tf.cast(image, tf.float32) * (1. / 255)
-    label = tf.cast(features['label'], tf.int32)
-
-    return image, label
-
-
-def train_input_fn(training_dir, params):
-    return _input_fn(training_dir, 'train.tfrecords', batch_size=100)
-
-
-def eval_input_fn(training_dir, params):
-    return _input_fn(training_dir, 'test.tfrecords', batch_size=100)
-
-
-def _input_fn(training_dir, training_filename, batch_size=100):
-    test_file = os.path.join(training_dir, training_filename)
-    filename_queue = tf.train.string_input_producer([test_file])
-
-    image, label = read_and_decode(filename_queue)
-    images, labels = tf.train.batch(
-        [image, label], batch_size=batch_size,
-        capacity=1000 + 3 * batch_size)
-
-    return {INPUT_TENSOR_NAME: images}, labels
diff --git a/batch_transform/tensorflow_distributed_mnist/utils.py b/batch_transform/tensorflow_distributed_mnist/utils.py
deleted file mode 100644
index bfddb0f9d1..0000000000
--- a/batch_transform/tensorflow_distributed_mnist/utils.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""Converts MNIST data to TFRecords file format with Example protos."""
-import os
-import tensorflow as tf
-
-
-def _int64_feature(value):
-    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
-
-
-def _bytes_feature(value):
-    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
-
-
-def convert_to(data_set, name, directory):
-    """Converts a dataset to tfrecords."""
-    images = data_set.images
-    labels = data_set.labels
-    num_examples = data_set.num_examples
-
-    if images.shape[0] != num_examples:
-        raise ValueError('Images size %d does not match label size %d.' %
-                         (images.shape[0], num_examples))
-    rows = images.shape[1]
-    cols = images.shape[2]
-    depth = images.shape[3]
-
-    filename = os.path.join(directory, name + '.tfrecords')
-    print('Writing', filename)
-    writer = tf.python_io.TFRecordWriter(filename)
-    for index in range(num_examples):
-        image_raw = images[index].tostring()
-        example = tf.train.Example(features=tf.train.Features(feature={
-            'height': _int64_feature(rows),
-            'width': _int64_feature(cols),
-            'depth': _int64_feature(depth),
-            'label': _int64_feature(int(labels[index])),
-            'image_raw': _bytes_feature(image_raw)}))
-        writer.write(example.SerializeToString())
-    writer.close()
diff --git a/batch_transform/mxnet_mnist/mxnet_mnist_with_batch_transform.ipynb b/sagemaker-python-sdk/mxnet_mnist/mxnet_mnist_with_batch_transform.ipynb
similarity index 100%
rename from batch_transform/mxnet_mnist/mxnet_mnist_with_batch_transform.ipynb
rename to sagemaker-python-sdk/mxnet_mnist/mxnet_mnist_with_batch_transform.ipynb
diff --git a/batch_transform/tensorflow_distributed_mnist/tensorflow_transformer_mnist.ipynb b/sagemaker-python-sdk/tensorflow_distributed_mnist/tensorflow_transformer_mnist.ipynb
similarity index 100%
rename from batch_transform/tensorflow_distributed_mnist/tensorflow_transformer_mnist.ipynb
rename to sagemaker-python-sdk/tensorflow_distributed_mnist/tensorflow_transformer_mnist.ipynb
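
Note (not part of the patch): the renamed notebooks drive SageMaker batch transform through the SageMaker Python SDK against training scripts like the mnist.py deleted above. As a rough, hypothetical sketch of that flow, assuming the v1 SDK of the time and using placeholder bucket names, role, and instance types:

    # Hypothetical sketch: train the MXNet MNIST script, then run offline
    # inference with batch transform instead of deploying an endpoint.
    # 's3://my-bucket/...' paths and instance types are placeholders.
    import sagemaker
    from sagemaker.mxnet import MXNet

    role = sagemaker.get_execution_role()  # assumes a SageMaker notebook environment

    estimator = MXNet(entry_point='mnist.py',
                      role=role,
                      train_instance_count=1,
                      train_instance_type='ml.m4.xlarge',
                      hyperparameters={'learning_rate': 0.1})
    estimator.fit({'train': 's3://my-bucket/mnist/train',
                   'test': 's3://my-bucket/mnist/test'})

    # Create a Transformer from the trained estimator and run a batch job.
    transformer = estimator.transformer(instance_count=1, instance_type='ml.m4.xlarge')
    transformer.transform('s3://my-bucket/mnist/batch-input', content_type='text/csv')
    transformer.wait()  # results land in the transformer's S3 output path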