Merge pull request #292 from awslabs/laurenyu-chainer-tuning
Add Chainer hyperparameter tuning notebook
djarpin authored Jun 21, 2018
2 parents 6d2c0a7 + ba842e9 commit 0606937
Showing 14 changed files with 742 additions and 0 deletions.

Large diffs are not rendered by default, and several of the changed files cannot be displayed in this view.
38 changes: 38 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/s3_util.py
@@ -0,0 +1,38 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import boto3
import tarfile
from urllib.parse import urlparse
import os

def retrieve_output_from_s3(s3_url, output_dir):
    """
    Downloads output artifacts from s3 and extracts them into the given directory.

    Args:
        s3_url: S3 URL to the output artifacts
        output_dir: directory to write artifacts to
    """
    o = urlparse(s3_url)
    s3 = boto3.resource('s3')
    output_data_path = os.path.join(output_dir)
    output_file_name = os.path.join(output_data_path, 'output.tar.gz')
    try:
        os.makedirs(output_data_path)
    except FileExistsError:
        pass
    s3.Bucket(o.netloc).download_file(o.path.lstrip('/'), output_file_name)
    tar = tarfile.open(output_file_name)
    tar.extractall(output_data_path)
    tar.close()
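
As a minimal sketch of how this helper might be called (for example, from a notebook after a training job finishes), the snippet below downloads and unpacks a job's output artifacts. The bucket, key, and local directory are hypothetical placeholders, and valid AWS credentials are assumed.

import s3_util

# Hypothetical output location of a completed SageMaker training job.
output_url = 's3://my-example-bucket/chainer-cifar10/output/output.tar.gz'

# Downloads output.tar.gz from S3 and extracts it into 'output/cifar10'.
s3_util.retrieve_output_from_s3(output_url, 'output/cifar10')
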
@@ -0,0 +1,148 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from __future__ import print_function, absolute_import

import argparse
import os

import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer import serializers
from chainer.training import extensions

import net

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # retrieve the hyperparameters we set from the client (with some defaults)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    num_gpus = int(os.environ['SM_NUM_GPUS'])

    train_data = np.load(os.path.join(args.train, 'train.npz'))['data']
    train_labels = np.load(os.path.join(args.train, 'train.npz'))['labels']

    test_data = np.load(os.path.join(args.test, 'test.npz'))['data']
    test_labels = np.load(os.path.join(args.test, 'test.npz'))['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epochs))
    print('# learning rate: {}'.format(args.learning_rate))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    optimizer = chainer.optimizers.MomentumSGD(args.learning_rate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Set up a trainer
    device = 0 if num_gpus > 0 else -1  # -1 indicates CPU, 0 indicates first GPU device.
    if num_gpus > 1:
        devices = range(num_gpus)
        train_iters = [chainer.iterators.MultiprocessIterator(i, args.batch_size, n_processes=4)
                       for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_gpus)
        updater = training.updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=range(num_gpus))
    else:
        train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size)
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False)
        updater = training.updaters.StandardUpdater(train_iter, optimizer, device=device)

    stop_trigger = (args.epochs, 'epoch')

    output_data_dir = os.path.join(args.output_dir, 'data')
    trainer = training.Trainer(updater, stop_trigger, out=output_data_dir)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training
    trainer.run()

    # Save the model to model_dir. It's loaded below in `model_fn`.
    serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)


def model_fn(model_dir):
    """
    This function is called by the Chainer container during hosting when running on SageMaker with
    values populated by the hosting environment.

    This function loads models written during training into `model_dir`.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        a loaded Chainer model

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False
    model = L.Classifier(net.VGG(10))
    serializers.load_npz(os.path.join(model_dir, 'model.npz'), model)
    return model.predictor
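
Since this training script is meant to be driven from the hyperparameter tuning notebook added in this PR, here is a minimal sketch of how it could be launched with the SageMaker Python SDK of that era. The entry-point filename, source directory, IAM role, S3 paths, instance type, hyperparameter ranges, and metric regex below are illustrative assumptions, not values taken from this commit.

from sagemaker.chainer import Chainer
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

# Hypothetical values; substitute your own role, S3 locations, and script name.
role = 'arn:aws:iam::123456789012:role/ExampleSageMakerRole'
train_input = 's3://my-example-bucket/chainer-cifar10/train'
test_input = 's3://my-example-bucket/chainer-cifar10/test'

estimator = Chainer(entry_point='train.py',          # the training script shown above, saved locally
                    source_dir='src',                 # assumed layout; net.py must be importable by the script
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.p3.2xlarge',
                    hyperparameters={'epochs': 50, 'batch-size': 64})

# Tune the learning rate against the validation accuracy reported in the training log.
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name='validation-accuracy',
    hyperparameter_ranges={'learning-rate': ContinuousParameter(0.001, 0.1)},
    metric_definitions=[{'Name': 'validation-accuracy',
                         'Regex': r'validation/main/accuracy=(\S+)'}],  # illustrative; must match this script's log format
    max_jobs=4,
    max_parallel_jobs=2)

tuner.fit({'train': train_input, 'test': test_input})
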
112 changes: 112 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/src/net.py
@@ -0,0 +1,112 @@
import chainer
import chainer.functions as F
import chainer.links as L


class Block(chainer.Chain):

    """A convolution, batch norm, ReLU block.

    A block in a feedforward network that performs a
    convolution followed by batch normalization followed
    by a ReLU activation.

    For the convolution operation, a square filter size is used.

    Args:
        out_channels (int): The number of output channels.
        ksize (int): The size of the filter is ksize x ksize.
        pad (int): The padding to use for the convolution.
    """

    def __init__(self, out_channels, ksize, pad=1):
        super(Block, self).__init__()
        with self.init_scope():
            self.conv = L.Convolution2D(None, out_channels, ksize, pad=pad,
                                        nobias=True)
            self.bn = L.BatchNormalization(out_channels)

    def __call__(self, x):
        h = self.conv(x)
        h = self.bn(h)
        return F.relu(h)


class VGG(chainer.Chain):

    """A VGG-style network for very small images.

    This model is based on the VGG-style model from
    http://torch.ch/blog/2015/07/30/cifar.html
    which is based on the network architecture from the paper:
    https://arxiv.org/pdf/1409.1556v6.pdf

    This model is intended to be used with either RGB or greyscale input
    images that are of size 32x32 pixels, such as those in the CIFAR10
    and CIFAR100 datasets.

    On CIFAR10, it achieves approximately 89% accuracy on the test set with
    no data augmentation.

    On CIFAR100, it achieves approximately 63% accuracy on the test set with
    no data augmentation.

    Args:
        class_labels (int): The number of class labels.
    """

    def __init__(self, class_labels=10):
        super(VGG, self).__init__()
        with self.init_scope():
            self.block1_1 = Block(64, 3)
            self.block1_2 = Block(64, 3)
            self.block2_1 = Block(128, 3)
            self.block2_2 = Block(128, 3)
            self.block3_1 = Block(256, 3)
            self.block3_2 = Block(256, 3)
            self.block3_3 = Block(256, 3)
            self.block4_1 = Block(512, 3)
            self.block4_2 = Block(512, 3)
            self.block4_3 = Block(512, 3)
            self.block5_1 = Block(512, 3)
            self.block5_2 = Block(512, 3)
            self.block5_3 = Block(512, 3)
            self.fc1 = L.Linear(None, 512, nobias=True)
            self.bn_fc1 = L.BatchNormalization(512)
            self.fc2 = L.Linear(None, class_labels, nobias=True)

    def __call__(self, x):
        # 64 channel blocks:
        h = self.block1_1(x)
        h = F.dropout(h, ratio=0.3)
        h = self.block1_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 128 channel blocks:
        h = self.block2_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block2_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 256 channel blocks:
        h = self.block3_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block4_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block5_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        h = F.dropout(h, ratio=0.5)
        h = self.fc1(h)
        h = self.bn_fc1(h)
        h = F.relu(h)
        h = F.dropout(h, ratio=0.5)
        return self.fc2(h)
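
As a quick sanity check, here is a minimal sketch (assuming Chainer and NumPy are installed and this module is importable as net) that runs a dummy CIFAR-sized input through the network; the input contents are placeholders.

import numpy as np
import chainer

import net

model = net.VGG(class_labels=10)

# One dummy 3-channel 32x32 image, as float32 (the shape CIFAR-10 inputs take).
x = np.zeros((1, 3, 32, 32), dtype=np.float32)

# Disable training-time behaviour (dropout) for the forward pass.
with chainer.using_config('train', False):
    y = model(x)

print(y.shape)  # expected: (1, 10), one score per class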
