Merge pull request #292 from awslabs/laurenyu-chainer-tuning
Add Chainer hyperparameter tuning notebook
djarpin authored Jun 21, 2018
2 parents 6d2c0a7 + ba842e9 commit 0606937
Showing 14 changed files with 742 additions and 0 deletions.

Large diffs are not rendered by default, and several of the changed files cannot be displayed in this view.
38 changes: 38 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/s3_util.py
@@ -0,0 +1,38 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import boto3
import tarfile
from urllib.parse import urlparse
import os

def retrieve_output_from_s3(s3_url, output_dir):
    """
    Downloads output artifacts from s3 and extracts them into the given directory.

    Args:
        s3_url: S3 URL to the output artifacts
        output_dir: directory to write artifacts to
    """
    o = urlparse(s3_url)
    s3 = boto3.resource('s3')
    output_data_path = os.path.join(output_dir)
    output_file_name = os.path.join(output_data_path, 'output.tar.gz')
    try:
        os.makedirs(output_data_path)
    except FileExistsError:
        pass
    s3.Bucket(o.netloc).download_file(o.path.lstrip('/'), output_file_name)
    tar = tarfile.open(output_file_name)
    tar.extractall(output_data_path)
    tar.close()
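
As a minimal sketch of how this helper might be called (for example, from a notebook after a training job finishes), the snippet below downloads and unpacks a job's output artifacts. The bucket, key, and local directory are hypothetical placeholders, and valid AWS credentials are assumed.

import s3_util

# Hypothetical output location of a completed SageMaker training job.
output_url = 's3://my-example-bucket/chainer-cifar10/output/output.tar.gz'

# Downloads output.tar.gz from S3 and extracts it into 'output/cifar10'.
s3_util.retrieve_output_from_s3(output_url, 'output/cifar10')
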
@@ -0,0 +1,148 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from __future__ import print_function, absolute_import

import argparse
import os

import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer import serializers
from chainer.training import extensions

import net

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # retrieve the hyperparameters we set from the client (with some defaults)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    num_gpus = int(os.environ['SM_NUM_GPUS'])

    train_data = np.load(os.path.join(args.train, 'train.npz'))['data']
    train_labels = np.load(os.path.join(args.train, 'train.npz'))['labels']

    test_data = np.load(os.path.join(args.test, 'test.npz'))['data']
    test_labels = np.load(os.path.join(args.test, 'test.npz'))['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epochs))
    print('# learning rate: {}'.format(args.learning_rate))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    optimizer = chainer.optimizers.MomentumSGD(args.learning_rate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Set up a trainer
    device = 0 if num_gpus > 0 else -1  # -1 indicates CPU, 0 indicates first GPU device.
    if num_gpus > 1:
        devices = range(num_gpus)
        train_iters = [chainer.iterators.MultiprocessIterator(i, args.batch_size, n_processes=4)
                       for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_gpus)
        updater = training.updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=range(num_gpus))
    else:
        train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size)
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False)
        updater = training.updaters.StandardUpdater(train_iter, optimizer, device=device)

    stop_trigger = (args.epochs, 'epoch')

    output_data_dir = os.path.join(args.output_dir, 'data')
    trainer = training.Trainer(updater, stop_trigger, out=output_data_dir)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training
    trainer.run()

    # Save the model to model_dir. It's loaded below in `model_fn`.
    serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)


def model_fn(model_dir):
    """
    This function is called by the Chainer container during hosting when running on SageMaker with
    values populated by the hosting environment.

    This function loads models written during training into `model_dir`.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        a loaded Chainer model

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False
    model = L.Classifier(net.VGG(10))
    serializers.load_npz(os.path.join(model_dir, 'model.npz'), model)
    return model.predictor
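
Since this training script is meant to be driven from the hyperparameter tuning notebook added in this PR, here is a minimal sketch of how it could be launched with the SageMaker Python SDK of that era. The entry-point filename, source directory, IAM role, S3 paths, instance type, hyperparameter ranges, and metric regex below are illustrative assumptions, not values taken from this commit.

from sagemaker.chainer import Chainer
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

# Hypothetical values; substitute your own role, S3 locations, and script name.
role = 'arn:aws:iam::123456789012:role/ExampleSageMakerRole'
train_input = 's3://my-example-bucket/chainer-cifar10/train'
test_input = 's3://my-example-bucket/chainer-cifar10/test'

estimator = Chainer(entry_point='train.py',          # the training script shown above, saved locally
                    source_dir='src',                 # assumed layout; net.py must be importable by the script
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.p3.2xlarge',
                    hyperparameters={'epochs': 50, 'batch-size': 64})

# Tune the learning rate against the validation accuracy reported in the training log.
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name='validation-accuracy',
    hyperparameter_ranges={'learning-rate': ContinuousParameter(0.001, 0.1)},
    metric_definitions=[{'Name': 'validation-accuracy',
                         'Regex': r'validation/main/accuracy=(\S+)'}],  # illustrative; must match this script's log format
    max_jobs=4,
    max_parallel_jobs=2)

tuner.fit({'train': train_input, 'test': test_input})
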
112 changes: 112 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/src/net.py
@@ -0,0 +1,112 @@
import chainer
import chainer.functions as F
import chainer.links as L


class Block(chainer.Chain):

    """A convolution, batch norm, ReLU block.

    A block in a feedforward network that performs a
    convolution followed by batch normalization followed
    by a ReLU activation.

    For the convolution operation, a square filter size is used.

    Args:
        out_channels (int): The number of output channels.
        ksize (int): The size of the filter is ksize x ksize.
        pad (int): The padding to use for the convolution.
    """

    def __init__(self, out_channels, ksize, pad=1):
        super(Block, self).__init__()
        with self.init_scope():
            self.conv = L.Convolution2D(None, out_channels, ksize, pad=pad,
                                        nobias=True)
            self.bn = L.BatchNormalization(out_channels)

    def __call__(self, x):
        h = self.conv(x)
        h = self.bn(h)
        return F.relu(h)


class VGG(chainer.Chain):

    """A VGG-style network for very small images.

    This model is based on the VGG-style model from
    http://torch.ch/blog/2015/07/30/cifar.html
    which is based on the network architecture from the paper:
    https://arxiv.org/pdf/1409.1556v6.pdf

    This model is intended to be used with either RGB or greyscale input
    images that are of size 32x32 pixels, such as those in the CIFAR10
    and CIFAR100 datasets.

    On CIFAR10, it achieves approximately 89% accuracy on the test set with
    no data augmentation.

    On CIFAR100, it achieves approximately 63% accuracy on the test set with
    no data augmentation.

    Args:
        class_labels (int): The number of class labels.
    """

    def __init__(self, class_labels=10):
        super(VGG, self).__init__()
        with self.init_scope():
            self.block1_1 = Block(64, 3)
            self.block1_2 = Block(64, 3)
            self.block2_1 = Block(128, 3)
            self.block2_2 = Block(128, 3)
            self.block3_1 = Block(256, 3)
            self.block3_2 = Block(256, 3)
            self.block3_3 = Block(256, 3)
            self.block4_1 = Block(512, 3)
            self.block4_2 = Block(512, 3)
            self.block4_3 = Block(512, 3)
            self.block5_1 = Block(512, 3)
            self.block5_2 = Block(512, 3)
            self.block5_3 = Block(512, 3)
            self.fc1 = L.Linear(None, 512, nobias=True)
            self.bn_fc1 = L.BatchNormalization(512)
            self.fc2 = L.Linear(None, class_labels, nobias=True)

    def __call__(self, x):
        # 64 channel blocks:
        h = self.block1_1(x)
        h = F.dropout(h, ratio=0.3)
        h = self.block1_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 128 channel blocks:
        h = self.block2_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block2_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 256 channel blocks:
        h = self.block3_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block4_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block5_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        h = F.dropout(h, ratio=0.5)
        h = self.fc1(h)
        h = self.bn_fc1(h)
        h = F.relu(h)
        h = F.dropout(h, ratio=0.5)
        return self.fc2(h)
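
As a quick sanity check, here is a minimal sketch (assuming Chainer and NumPy are installed and this module is importable as net) that runs a dummy CIFAR-sized input through the network; the input contents are placeholders.

import numpy as np
import chainer

import net

model = net.VGG(class_labels=10)

# One dummy 3-channel 32x32 image, as float32 (the shape CIFAR-10 inputs take).
x = np.zeros((1, 3, 32, 32), dtype=np.float32)

# Disable training-time behaviour (dropout) for the forward pass.
with chainer.using_config('train', False):
    y = model(x)

print(y.shape)  # expected: (1, 10), one score per class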
