Add Chainer hyperparameter tuning notebook #292

Merged 2 commits on Jun 21, 2018
38 changes: 38 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/s3_util.py
@@ -0,0 +1,38 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import os
import tarfile
from urllib.parse import urlparse

import boto3


def retrieve_output_from_s3(s3_url, output_dir):
    """
    Downloads output artifacts from S3 and extracts them into the given directory.

    Args:
        s3_url: S3 URL to the output artifacts
        output_dir: directory to write artifacts to
    """
    o = urlparse(s3_url)
    s3 = boto3.resource('s3')
    output_file_name = os.path.join(output_dir, 'output.tar.gz')
    os.makedirs(output_dir, exist_ok=True)
    s3.Bucket(o.netloc).download_file(o.path.lstrip('/'), output_file_name)
    with tarfile.open(output_file_name) as tar:
        tar.extractall(output_dir)
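
For reference, a minimal usage sketch of the helper above. The bucket and key are hypothetical placeholders; in practice the URL would come from a completed SageMaker training job's output location.

# Hypothetical example: download and unpack a training job's output artifacts.
from s3_util import retrieve_output_from_s3

retrieve_output_from_s3('s3://example-bucket/chainer-cifar10/output/output.tar.gz',
                        'output/single_machine')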
148 changes: 148 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/src/ (the Chainer training script)
@@ -0,0 +1,148 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from __future__ import print_function, absolute_import

import argparse
import os

import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer import serializers
from chainer.training import extensions

import net

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # Retrieve the hyperparameters we set from the client (with some defaults).
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    # parse_known_args ignores any extra arguments the SageMaker container passes.
    args, _ = parser.parse_known_args()

    # SM_NUM_GPUS is set by the SageMaker container to the number of GPUs on the instance.
    num_gpus = int(os.environ['SM_NUM_GPUS'])

    # Load the preprocessed CIFAR-10 arrays from the train and test channels.
    train_npz = np.load(os.path.join(args.train, 'train.npz'))
    train_data, train_labels = train_npz['data'], train_npz['labels']

    test_npz = np.load(os.path.join(args.test, 'test.npz'))
    test_data, test_labels = test_npz['data'], test_npz['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epochs))
    print('# learning rate: {}'.format(args.learning_rate))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    optimizer = chainer.optimizers.MomentumSGD(args.learning_rate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Set up a trainer. device=-1 indicates CPU; device=0 indicates the first GPU.
    device = 0 if num_gpus > 0 else -1
    if num_gpus > 1:
        devices = range(num_gpus)
        train_iters = [chainer.iterators.MultiprocessIterator(i, args.batch_size, n_processes=4)
                       for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False,
                                                           n_processes=num_gpus)
        updater = training.updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=devices)
    else:
        train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size)
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False)
        updater = training.updaters.StandardUpdater(train_iter, optimizer, device=device)

    stop_trigger = (args.epochs, 'epoch')

    output_data_dir = os.path.join(args.output_dir, 'data')
    trainer = training.Trainer(updater, stop_trigger, out=output_data_dir)

    # Evaluate the model with the test dataset for each epoch.
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration.
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch.
    trainer.extend(extensions.LogReport())

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout.
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training.
    trainer.run()

    # Save the model to model_dir. It's loaded below in `model_fn`.
    serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)


def model_fn(model_dir):
    """
    This function is called by the Chainer container during hosting when running on SageMaker,
    with values populated by the hosting environment.

    It loads the model that training wrote to `model_dir`.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        a loaded Chainer model

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False
    model = L.Classifier(net.VGG(10))
    serializers.load_npz(os.path.join(model_dir, 'model.npz'), model)
    return model.predictor
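
Since the purpose of this PR is a hyperparameter tuning notebook, the sketch below shows one way this entry point could be driven by the SageMaker Python SDK's HyperparameterTuner. It is a minimal sketch, not the notebook's actual code: the entry-point file name, IAM role, channel inputs, and metric regex are all assumptions.

# A minimal sketch (SageMaker Python SDK v1.x, circa 2018); the names flagged
# in comments are hypothetical, not taken from this PR.
from sagemaker.chainer import Chainer
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

estimator = Chainer(entry_point='chainer_cifar_vgg_single_machine.py',  # hypothetical file name
                    source_dir='src',
                    role=role,                           # IAM role ARN, defined elsewhere
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    hyperparameters={'epochs': 50, 'batch-size': 64})

# Tune the value passed to the script's --learning-rate argument.
hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.001, 0.1)}

# The Regex below is a placeholder: it must be adapted to match the
# PrintReport table the script writes to stdout.
tuner = HyperparameterTuner(estimator,
                            objective_metric_name='validation-accuracy',
                            hyperparameter_ranges=hyperparameter_ranges,
                            metric_definitions=[{'Name': 'validation-accuracy',
                                                 'Regex': r'validation/main/accuracy[\s=]+([0-9\.]+)'}],
                            objective_type='Maximize',
                            max_jobs=4,
                            max_parallel_jobs=2)

tuner.fit({'train': train_input, 'test': test_input})  # S3 channel URIs, defined elsewhere

The tuner launches up to max_jobs training jobs, each invoking this script with a different --learning-rate, and selects the best model according to the objective metric parsed from the job logs.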
112 changes: 112 additions & 0 deletions hyperparameter_tuning/chainer_cifar10/src/net.py
@@ -0,0 +1,112 @@
import chainer
import chainer.functions as F
import chainer.links as L


class Block(chainer.Chain):
    """A convolution, batch norm, ReLU block.

    A block in a feedforward network that performs a convolution
    followed by batch normalization followed by a ReLU activation.

    For the convolution operation, a square filter size is used.

    Args:
        out_channels (int): The number of output channels.
        ksize (int): The size of the filter is ksize x ksize.
        pad (int): The padding to use for the convolution.
    """

    def __init__(self, out_channels, ksize, pad=1):
        super(Block, self).__init__()
        with self.init_scope():
            self.conv = L.Convolution2D(None, out_channels, ksize, pad=pad,
                                        nobias=True)
            self.bn = L.BatchNormalization(out_channels)

    def __call__(self, x):
        h = self.conv(x)
        h = self.bn(h)
        return F.relu(h)


class VGG(chainer.Chain):
    """A VGG-style network for very small images.

    This model is based on the VGG-style model from
    http://torch.ch/blog/2015/07/30/cifar.html
    which is based on the network architecture from the paper:
    https://arxiv.org/pdf/1409.1556v6.pdf

    This model is intended to be used with either RGB or greyscale input
    images that are of size 32x32 pixels, such as those in the CIFAR10
    and CIFAR100 datasets.

    On CIFAR10, it achieves approximately 89% accuracy on the test set with
    no data augmentation.

    On CIFAR100, it achieves approximately 63% accuracy on the test set with
    no data augmentation.

    Args:
        class_labels (int): The number of class labels.
    """

    def __init__(self, class_labels=10):
        super(VGG, self).__init__()
        with self.init_scope():
            self.block1_1 = Block(64, 3)
            self.block1_2 = Block(64, 3)
            self.block2_1 = Block(128, 3)
            self.block2_2 = Block(128, 3)
            self.block3_1 = Block(256, 3)
            self.block3_2 = Block(256, 3)
            self.block3_3 = Block(256, 3)
            self.block4_1 = Block(512, 3)
            self.block4_2 = Block(512, 3)
            self.block4_3 = Block(512, 3)
            self.block5_1 = Block(512, 3)
            self.block5_2 = Block(512, 3)
            self.block5_3 = Block(512, 3)
            self.fc1 = L.Linear(None, 512, nobias=True)
            self.bn_fc1 = L.BatchNormalization(512)
            self.fc2 = L.Linear(None, class_labels, nobias=True)

    def __call__(self, x):
        # 64 channel blocks:
        h = self.block1_1(x)
        h = F.dropout(h, ratio=0.3)
        h = self.block1_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 128 channel blocks:
        h = self.block2_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block2_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 256 channel blocks:
        h = self.block3_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block4_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block5_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # Fully connected classifier head.
        h = F.dropout(h, ratio=0.5)
        h = self.fc1(h)
        h = self.bn_fc1(h)
        h = F.relu(h)
        h = F.dropout(h, ratio=0.5)
        return self.fc2(h)
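
As a quick local sanity check, the network can be run forward on dummy CIFAR-shaped input. This is a minimal sketch assuming only NumPy and Chainer; it is not part of the PR.

# Smoke test for net.VGG: one forward pass over random 32x32 RGB images.
import numpy as np
import chainer
import net

model = net.VGG(class_labels=10)
x = np.random.rand(4, 3, 32, 32).astype(np.float32)  # batch of 4 dummy images
with chainer.using_config('train', False):  # disables dropout during the pass
    logits = model(x)
print(logits.shape)  # expected: (4, 10)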