Merge pull request #292 from awslabs/laurenyu-chainer-tuning
Add Chainer hyperparameter tuning notebook
Showing 14 changed files with 742 additions and 0 deletions.
444 changes: 444 additions & 0 deletions
hyperparameter_tuning/chainer_cifar10/chainer_single_machine_cifar10.ipynb
Large diffs are not rendered by default.
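Since the notebook's diff is not rendered here, the following is a minimal, illustrative sketch of how a Chainer estimator is typically paired with SageMaker's automatic model tuning via the SageMaker Python SDK. The objective metric name and regex, instance type, tuning range, and S3 paths below are assumptions for illustration, not values taken from this commit.

# Illustrative sketch only (not from this commit). Assumes the SageMaker Python SDK;
# the metric definition, instance type, and tuning range are hypothetical.
import sagemaker
from sagemaker.chainer import Chainer
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter

estimator = Chainer(entry_point='chainer_cifar_vgg_single_machine.py',
                    source_dir='src',
                    role=sagemaker.get_execution_role(),
                    framework_version='4.0.0',
                    train_instance_count=1,
                    train_instance_type='ml.p3.2xlarge',
                    hyperparameters={'epochs': 50, 'batch-size': 64})

# Tune the value passed to the training script's --learning-rate argument.
hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.001, 0.1)}

tuner = HyperparameterTuner(estimator,
                            objective_metric_name='validation-accuracy',
                            objective_type='Maximize',
                            metric_definitions=[{'Name': 'validation-accuracy',
                                                 'Regex': r'validation/main/accuracy=(\S+)'}],  # hypothetical regex
                            hyperparameter_ranges=hyperparameter_ranges,
                            max_jobs=4,
                            max_parallel_jobs=2)

# 'train' and 'test' match the SM_CHANNEL_TRAIN / SM_CHANNEL_TEST channels read by the script.
tuner.fit({'train': 's3://<bucket>/chainer-cifar/train',
           'test': 's3://<bucket>/chainer-cifar/test'})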
(10 of the changed files could not be displayed in this view.)
@@ -0,0 +1,38 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import boto3
import tarfile
from urllib.parse import urlparse
import os


def retrieve_output_from_s3(s3_url, output_dir):
    """
    Downloads output artifacts from S3 and extracts them into the given directory.

    Args:
        s3_url: S3 URL to the output artifacts
        output_dir: directory to write artifacts to
    """
    o = urlparse(s3_url)
    s3 = boto3.resource('s3')
    output_data_path = os.path.join(output_dir)
    output_file_name = os.path.join(output_data_path, 'output.tar.gz')
    try:
        os.makedirs(output_data_path)
    except FileExistsError:
        pass
    s3.Bucket(o.netloc).download_file(o.path.lstrip('/'), output_file_name)
    tar = tarfile.open(output_file_name)
    tar.extractall(output_data_path)
    tar.close()
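For reference, a hypothetical call to this helper; the bucket and job prefix below are placeholders, not values from this commit.

# Download and unpack a training job's output.tar.gz into a local directory.
retrieve_output_from_s3('s3://<bucket>/<job-name>/output/output.tar.gz',
                        'output/single_machine_training')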
148 changes: 148 additions & 0 deletions
hyperparameter_tuning/chainer_cifar10/src/chainer_cifar_vgg_single_machine.py
@@ -0,0 +1,148 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from __future__ import print_function, absolute_import

import argparse
import os

import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer import serializers
from chainer.training import extensions

import net


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # retrieve the hyperparameters we set from the client (with some defaults)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    num_gpus = int(os.environ['SM_NUM_GPUS'])

    train_data = np.load(os.path.join(args.train, 'train.npz'))['data']
    train_labels = np.load(os.path.join(args.train, 'train.npz'))['labels']

    test_data = np.load(os.path.join(args.test, 'test.npz'))['data']
    test_labels = np.load(os.path.join(args.test, 'test.npz'))['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epochs))
    print('# learning rate: {}'.format(args.learning_rate))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    optimizer = chainer.optimizers.MomentumSGD(args.learning_rate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Set up a trainer.
    device = 0 if num_gpus > 0 else -1  # -1 indicates CPU, 0 indicates first GPU device.
    if num_gpus > 1:
        devices = range(num_gpus)
        train_iters = [chainer.iterators.MultiprocessIterator(i, args.batch_size, n_processes=4)
                       for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_gpus)
        updater = training.updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=range(num_gpus))
    else:
        train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size)
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False)
        updater = training.updater.StandardUpdater(train_iter, optimizer, device=device)

    stop_trigger = (args.epochs, 'epoch')

    output_data_dir = os.path.join(args.output_dir, 'data')
    trainer = training.Trainer(updater, stop_trigger, out=output_data_dir)

    # Evaluate the model with the test dataset for each epoch.
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from the 'loss' variable at the first iteration.
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch.
    trainer.extend(extensions.LogReport())

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout.
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training.
    trainer.run()

    # Save the model to model_dir. It's loaded below in `model_fn`.
    serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)


def model_fn(model_dir):
    """
    This function is called by the Chainer container during hosting when running on SageMaker with
    values populated by the hosting environment.

    This function loads models written during training into `model_dir`.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        a loaded Chainer model

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False
    model = L.Classifier(net.VGG(10))
    serializers.load_npz(os.path.join(model_dir, 'model.npz'), model)
    return model.predictor
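As an aside, here is a minimal local check of `model_fn`, assuming a `model.npz` produced by the training loop above has been extracted into a local `./model` directory; the path and dummy input are illustrative and not part of this commit.

import numpy as np

predictor = model_fn('./model')                      # the VGG predictor without the Classifier wrapper
dummy = np.zeros((1, 3, 32, 32), dtype=np.float32)   # one CIFAR-10-shaped RGB image
scores = predictor(dummy)                            # raw class scores
print(scores.shape)                                  # (1, 10)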
@@ -0,0 +1,112 @@
import chainer
import chainer.functions as F
import chainer.links as L


class Block(chainer.Chain):

    """A convolution, batch norm, ReLU block.

    A block in a feedforward network that performs a
    convolution followed by batch normalization followed
    by a ReLU activation.

    For the convolution operation, a square filter size is used.

    Args:
        out_channels (int): The number of output channels.
        ksize (int): The size of the filter is ksize x ksize.
        pad (int): The padding to use for the convolution.
    """

    def __init__(self, out_channels, ksize, pad=1):
        super(Block, self).__init__()
        with self.init_scope():
            self.conv = L.Convolution2D(None, out_channels, ksize, pad=pad,
                                        nobias=True)
            self.bn = L.BatchNormalization(out_channels)

    def __call__(self, x):
        h = self.conv(x)
        h = self.bn(h)
        return F.relu(h)


class VGG(chainer.Chain):

    """A VGG-style network for very small images.

    This model is based on the VGG-style model from
    http://torch.ch/blog/2015/07/30/cifar.html
    which is based on the network architecture from the paper:
    https://arxiv.org/pdf/1409.1556v6.pdf

    This model is intended to be used with either RGB or greyscale input
    images that are of size 32x32 pixels, such as those in the CIFAR10
    and CIFAR100 datasets.

    On CIFAR10, it achieves approximately 89% accuracy on the test set with
    no data augmentation.

    On CIFAR100, it achieves approximately 63% accuracy on the test set with
    no data augmentation.

    Args:
        class_labels (int): The number of class labels.
    """

    def __init__(self, class_labels=10):
        super(VGG, self).__init__()
        with self.init_scope():
            self.block1_1 = Block(64, 3)
            self.block1_2 = Block(64, 3)
            self.block2_1 = Block(128, 3)
            self.block2_2 = Block(128, 3)
            self.block3_1 = Block(256, 3)
            self.block3_2 = Block(256, 3)
            self.block3_3 = Block(256, 3)
            self.block4_1 = Block(512, 3)
            self.block4_2 = Block(512, 3)
            self.block4_3 = Block(512, 3)
            self.block5_1 = Block(512, 3)
            self.block5_2 = Block(512, 3)
            self.block5_3 = Block(512, 3)
            self.fc1 = L.Linear(None, 512, nobias=True)
            self.bn_fc1 = L.BatchNormalization(512)
            self.fc2 = L.Linear(None, class_labels, nobias=True)

    def __call__(self, x):
        # 64 channel blocks:
        h = self.block1_1(x)
        h = F.dropout(h, ratio=0.3)
        h = self.block1_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 128 channel blocks:
        h = self.block2_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block2_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 256 channel blocks:
        h = self.block3_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block4_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block5_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        h = F.dropout(h, ratio=0.5)
        h = self.fc1(h)
        h = self.bn_fc1(h)
        h = F.relu(h)
        h = F.dropout(h, ratio=0.5)
        return self.fc2(h)
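A quick way to sanity-check the architecture is a forward pass on a random CIFAR-10-sized batch; this snippet is illustrative only and not part of the commit.

import numpy as np
import chainer

model = VGG(class_labels=10)
x = np.random.rand(4, 3, 32, 32).astype(np.float32)  # batch of 4 RGB 32x32 images
with chainer.using_config('train', False):            # disable dropout for a deterministic pass
    y = model(x)
print(y.shape)  # (4, 10): one raw score per class for each image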