Speed drop when setting multi_precision=True
#13650
Replies: 5 comments
-
@kaleidoscopical : Could you share the results you found? Also, it would be great if you could share a minimal reproducible script so that anyone in the community could verify quickly and get back. This also seems like a very good topic for discuss.mxnet.io; you might catch the interest of a wider audience there. @mxnet-label-bot add [question]
-
The script I have tried is simplified and posted below; I think the only required modification to run it is adapting the hard-coded settings (data paths, GPUs) to your own environment.
```python
import mxnet as mx
import numpy as np
import argparse
import os
import logging


class ResNetV1D(object):
    def __init__(self, num_outputs=1000, workspace=1024,
                 bn_mom=0.9, bn_eps=1e-5):
        super(ResNetV1D, self).__init__()
        self.num_outputs = num_outputs
        self.workspace = workspace
        self.bn_mom = bn_mom
        self.bn_eps = bn_eps

    def conv3x3(self, data, name, num_filter, stride=None):
        if stride is None:
            output = mx.sym.Convolution(data=data, num_filter=num_filter,
                                        kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                                        no_bias=True, name=name, workspace=self.workspace)
        else:
            assert isinstance(stride, int), "stride should be int"
            output = mx.sym.Convolution(data=data, num_filter=num_filter,
                                        kernel=(3, 3), stride=(stride, stride), pad=(1, 1),
                                        no_bias=True, name=name, workspace=self.workspace)
        return output

    def conv1x1(self, data, name, num_filter):
        output = mx.sym.Convolution(data=data, num_filter=num_filter,
                                    kernel=(1, 1), stride=(1, 1), pad=(0, 0),
                                    no_bias=True, name=name, workspace=self.workspace)
        return output

    def bn(self, data, name, fix_gamma=False, last_gamma=False):
        # config gamma and beta
        if not last_gamma:
            gamma = mx.sym.Variable(name=name+'_gamma', wd_mult=0)
        else:
            gamma = mx.sym.Variable(name=name+'_gamma', wd_mult=0, init=mx.init.Zero())
        beta = mx.sym.Variable(name=name+'_beta', wd_mult=0)
        # run bn
        output = mx.sym.BatchNorm(data=data, gamma=gamma, beta=beta,
                                  fix_gamma=fix_gamma, eps=self.bn_eps,
                                  momentum=self.bn_mom, name=name)
        return output

    def relu(self, data, name):
        output = mx.sym.Activation(data=data, act_type='relu', name=name)
        return output

    def pooling(self, data, name, pool_type, kernel=2, stride=2, pad=0, global_pool=False):
        if not global_pool:
            assert isinstance(kernel, int), "kernel should be int"
            assert isinstance(stride, int), "stride should be int"
            assert isinstance(pad, int), "pad should be int"
            output = mx.symbol.Pooling(data=data, pool_type=pool_type,
                                       kernel=(kernel, kernel), stride=(stride, stride),
                                       pad=(pad, pad), name=name)
        else:
            output = mx.symbol.Pooling(data=data, global_pool=True,
                                       kernel=(kernel, kernel),
                                       pool_type=pool_type, name=name)
        return output

    def fc(self, data, name, num_hidden):
        bias = mx.sym.Variable(name=name+'_bias', wd_mult=0)
        output = mx.symbol.FullyConnected(data=data, bias=bias,
                                          num_hidden=num_hidden, name=name)
        return output

    def build_stem(self, data):
        # conv1
        output = self.conv3x3(data=data, name="stage0_conv1", num_filter=32, stride=2)
        output = self.bn(output, name="stage0_bn1")
        output = self.relu(output, name="stage0_relu1")
        # conv2
        output = self.conv3x3(data=output, name="stage0_conv2", num_filter=32)
        output = self.bn(output, name="stage0_bn2")
        output = self.relu(output, name="stage0_relu2")
        # conv3
        output = self.conv3x3(data=output, name="stage0_conv3", num_filter=64)
        output = self.bn(output, name="stage0_bn3")
        output = self.relu(output, name="stage0_relu3")
        # max_pooling
        output = self.pooling(data=output, name="stage0_max_pooling",
                              pool_type='max', kernel=3, stride=2, pad=1)
        return output

    def build_block(self, data, name, channel_num, stride=1, down_sample=False):
        # downsample
        if down_sample:
            if stride != 1:
                downsample = self.pooling(data, name=name+"_avg_pool",
                                          pool_type='avg', kernel=2, stride=2, pad=0)
            else:
                downsample = data
            downsample = self.conv1x1(downsample, name=name+"_downsample_conv",
                                      num_filter=channel_num*4)
            downsample = self.bn(downsample, name=name+"_downsample_bn")
        else:
            downsample = data
        # conv1
        output = self.conv1x1(data, name=name+"_conv1", num_filter=channel_num)
        output = self.bn(output, name=name+"_bn1")
        output = self.relu(output, name=name+"_relu1")
        # conv2
        output = self.conv3x3(output, name=name+"_conv2",
                              num_filter=channel_num, stride=stride)
        output = self.bn(output, name=name+"_bn2")
        output = self.relu(output, name=name+"_relu2")
        # conv3
        output = self.conv1x1(output, name=name+"_conv3", num_filter=channel_num*4)
        output = self.bn(output, name=name+"_bn3", last_gamma=True)
        output = self.relu(output + downsample, name=name+"_relu3")
        return output

    def build_stage(self, data, name, channel_num, block_num, stride=2):
        # config
        assert isinstance(name, str), "name should be str"
        # first block
        output = self.build_block(data, name=name+"_block1", channel_num=channel_num,
                                  stride=stride, down_sample=True)
        # rest blocks
        for i in range(1, block_num):
            output = self.build_block(output, name=name+"_block"+str(i+1),
                                      channel_num=channel_num)
        return output

    def build(self):
        data = mx.sym.Variable(name="data")
        label = mx.sym.Variable(name="softmax_label")
        data = mx.sym.Cast(data=data, dtype=np.float16)
        stage0 = self.build_stem(data)
        stage1 = self.build_stage(stage0, name="stage1", channel_num=64, block_num=3, stride=1)
        stage2 = self.build_stage(stage1, name="stage2", channel_num=128, block_num=4)
        stage3 = self.build_stage(stage2, name="stage3", channel_num=256, block_num=6)
        stage4 = self.build_stage(stage3, name="stage4", channel_num=512, block_num=3)
        output = self.pooling(stage4, name="global_pooling", pool_type="avg", global_pool=True)
        output = mx.symbol.Flatten(data=output)
        output = self.fc(output, name="classifier", num_hidden=self.num_outputs)
        output = mx.sym.Cast(data=output, dtype=np.float32)
        output = mx.sym.SoftmaxOutput(data=output, label=label,
                                      name="softmax_output", smooth_alpha=0.1)
        return output


# config
parser = argparse.ArgumentParser(description='Train a model for image classification.')
parser.add_argument('--logging-file', type=str, default='train_imagenet.log',
                    help='name of training log file')
parser.add_argument('--save-dir', type=str, default='params',
                    help='directory of saved models')
opt = parser.parse_args()

filehandler = logging.FileHandler(opt.logging_file)
streamhandler = logging.StreamHandler()
logger = logging.getLogger('')
logger.setLevel(logging.INFO)
logger.addHandler(filehandler)
logger.addHandler(streamhandler)
logger.info(opt)

batch_size = 128 * 4
classes = 1000
num_training_samples = 1281167
begin_epoch = 0
epoch = 120
lr = 0.2
wd = 0.0001
warmup_epoch = 5
ctx = [mx.gpu(i) for i in range(4)]
frequent = 50

# lr_scheduler
num_batches = num_training_samples // batch_size
max_update = num_batches * epoch
warmup_steps = num_batches * warmup_epoch
lr_scheduler = mx.lr_scheduler.CosineScheduler(
    max_update=max_update,
    base_lr=lr,
    final_lr=0.,
    warmup_steps=warmup_steps,
    warmup_begin_lr=0.,
    warmup_mode="linear"
)

# symbol
model = ResNetV1D()
symbol = model.build()

# iterator
train_iter = mx.io.ImageRecordIter(
    path_imgrec="data/imagenet/train.rec",
    path_imgidx="data/imagenet/train.idx",
    data_shape=(3, 224, 224),
    preprocess_threads=60,
    shuffle=True,
    batch_size=batch_size,
    random_resized_crop=True,
    max_aspect_ratio=4 / 3.,
    min_aspect_ratio=3 / 4.,
    max_random_area=1.,
    min_random_area=0.08,
    brightness=0.4,
    saturation=0.4,
    contrast=0.4,
    pca_noise=0.1,
    random_mirror=True,
    mean_r=123.68,
    mean_g=116.779,
    mean_b=103.939,
    std_r=58.393,
    std_g=57.12,
    std_b=57.375,
)
val_iter = mx.io.ImageRecordIter(
    path_imgrec="data/imagenet/val.rec",
    path_imgidx="data/imagenet/val.idx",
    data_shape=(3, 224, 224),
    preprocess_threads=60,
    shuffle=False,
    batch_size=batch_size,
    resize=256,
    mean_r=123.68,
    mean_g=116.779,
    mean_b=103.939,
    std_r=58.393,
    std_g=57.12,
    std_b=57.375,
)

# module
data_names = [k[0] for k in train_iter.provide_data]
label_names = [k[0] for k in train_iter.provide_label]
mod = mx.mod.Module(symbol, data_names=data_names,
                    label_names=label_names, logger=logger, context=ctx)

# metric
eval_metrics = mx.metric.CompositeEvalMetric()
for child_metric in [mx.metric.Accuracy()]:
    eval_metrics.add(child_metric)

# callback
batch_end_callback = mx.callback.Speedometer(train_iter.batch_size, frequent=frequent, auto_reset=False)
epoch_end_callback = mx.callback.do_checkpoint(opt.save_dir)

# optimizer
optimizer_params = {'momentum': 0.9,
                    'wd': wd,
                    'lr_scheduler': lr_scheduler,
                    'multi_precision': True}

# fit
mod.fit(train_iter, val_iter, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
        batch_end_callback=batch_end_callback, optimizer='nag', optimizer_params=optimizer_params,
        initializer=mx.initializer.MSRAPrelu(), begin_epoch=begin_epoch, num_epoch=epoch)
```
-
UPDATE:
-
I found the difference is the default choice of
The problem is solved by passing
By the way, I am still confused why there is a difference when setting
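The exact argument is not preserved in the comment above, so the snippet below is only a hedged illustration of where such a setting would be passed explicitly with mx.mod.Module: constructing the optimizer object yourself and choosing the kvstore in mod.fit instead of relying on the defaults. The values are placeholders, not the author's actual fix.

```python
import mxnet as mx

# Placeholder values -- not the author's actual fix. `mod`, `train_iter`, `val_iter`,
# `eval_metrics` and `epoch` refer to the objects defined in the script above.
optimizer = mx.optimizer.NAG(learning_rate=0.2, momentum=0.9, wd=0.0001,
                             multi_precision=True)

mod.fit(train_iter, val_iter,
        eval_metric=eval_metrics,
        optimizer=optimizer,     # explicit optimizer instance instead of the string 'nag'
        kvstore='device',        # mod.fit defaults to kvstore='local' (CPU-side gradient aggregation)
        initializer=mx.initializer.MSRAPrelu(),
        num_epoch=epoch)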
-
It's hard to give a definitive answer without looking at the profile, but the most probable explanation is as follows:
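As general background on what the flag changes (this is standard MXNet behaviour, not the commenter's specific explanation): with multi_precision=True the optimizer keeps a float32 master copy of every float16 weight, casts the gradient up, applies the update in float32, and casts the result back to float16. A rough sketch of that per-step extra work, with purely illustrative shapes:

```python
import mxnet as mx

# Illustrative shapes only -- not taken from the discussion above.
weight_fp16 = mx.nd.random.normal(shape=(1024, 1024), dtype='float16')
grad_fp16 = mx.nd.random.normal(shape=(1024, 1024), dtype='float16')
lr = 0.2

# multi_precision=True: the optimizer keeps a float32 master copy of the weight...
weight_fp32 = weight_fp16.astype('float32')

# ...applies the update in float32 (the gradient is cast up first)...
weight_fp32 -= lr * grad_fp16.astype('float32')

# ...and casts the result back to float16 for the next forward pass.
weight_fp16 = weight_fp32.astype('float16')
```

In the real optimizer this is done by fused update kernels, but the float32 master copy and the casts are the additional work compared with a pure float16 update.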
-
A large speed drop is observed when setting multi_precision=True in mx.mod.Module, but not in gluon.Trainer. Has anyone run into a similar situation? Any suggestions?
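For comparison, a minimal sketch of how the same flag is passed on the gluon.Trainer side; the network choice and hyperparameters here are illustrative, not taken from the author's Gluon run:

```python
import mxnet as mx
from mxnet import gluon

# Illustrative only: network, context count and hyperparameters are placeholders.
ctx = [mx.gpu(i) for i in range(4)]
net = gluon.model_zoo.vision.resnet50_v2(classes=1000)
net.cast('float16')                            # fp16 parameters, as in the symbolic script above
net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

trainer = gluon.Trainer(net.collect_params(), 'nag',
                        {'learning_rate': 0.2, 'momentum': 0.9, 'wd': 0.0001,
                         'multi_precision': True})
```

In the training loop, trainer.step(batch_size) then runs the parameter update after each backward pass.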