From 72bb214c8a8afd71b98ef03fd3fd10cfe5fb3a86 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 5 Jun 2018 21:51:17 +0800 Subject: [PATCH 01/12] 1. add weight decay feature into fluid benchmark test 2. add learning rate decay feature into fluid benchmark test 3. add L1&L2 regularization feature into fluid benchmark test 4. add error clipping feature into fluid benchmark test 5. add gradient clipping feature into fluid benchmark test --- benchmark/fluid/fluid_benchmark.py | 69 +++++++++++++++++++ benchmark/fluid/models/machine_translation.py | 24 ++++++- benchmark/fluid/models/mnist.py | 26 ++++++- benchmark/fluid/models/resnet.py | 28 ++++++-- .../fluid/models/stacked_dynamic_lstm.py | 20 +++++- benchmark/fluid/models/vgg.py | 23 ++++++- 6 files changed, 176 insertions(+), 14 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 9d33a841cddb8d..67d1591595a068 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -39,8 +39,67 @@ def parse_args(): help='The model to run benchmark with.') parser.add_argument( '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') + parser.add_argument( + '--learning_rate_decay_method', + type=str, + default=None, + choices=['exponential', 'natural_exp', 'inverse_time'], + help='Learning rate decay method, can be exponential, natural_exp, inverse_time' + ) + parser.add_argument( + '--learning_rate_decay_steps', + type=int, + default=100000, + help='Decay steps for learning rate decay method') + parser.add_argument( + '--learning_rate_decay_rate', + type=float, + default=0.5, + help='Decay rate for learning rate decay method') + # args related to regularization + parser.add_argument( + '--weight_decay_regularizer_method', + type=str, + default=None, + choices=['L1', 'L2'], + help='Weight decay regularizer method, can be L1, L2') + parser.add_argument( + '--weight_decay_regularizer_coeff', + type=float, + default=0.1, + help='Weight decay regularizer coeff, 0.1 for default') + # args related to gradient clipping + parser.add_argument( + '--gradient_clip_method', + type=str, + default=None, + choices=['Norm', 'GlobalNorm'], + help='Gradient clipping method, can be Norm, GlobalNorm') + parser.add_argument( + '--gradient_clip_norm', + type=float, + default=1., + help='Gradient clipping norm, 1. for default') + # args related to error clipping + parser.add_argument( + '--error_clip_method', + type=str, + default=None, + choices=['Value'], + help='Error clipping method, can be Value') + parser.add_argument( + '--error_clip_min', + type=float, + default=1e-6, + help='Error clipping min value, 1e-6 for default') + parser.add_argument( + '--error_clip_max', + type=float, + default=2e-6, + help='Error clipping max value, 2e-6 for default') # TODO(wuyi): add "--use_fake_data" option back. 
parser.add_argument( '--skip_batch_num', @@ -103,6 +162,16 @@ def parse_args(): default='local', choices=['local', 'pserver', 'nccl2'], help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') args = parser.parse_args() return args diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd27b2..637b291f8a4f6e 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -26,6 +26,10 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.executor import Executor +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): @@ -50,7 +54,7 @@ def linear(inputs): def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, - target_dict_dim, is_generating, beam_size, max_length): + target_dict_dim, is_generating, beam_size, max_length, args): """Construct a seq2seq network.""" def bi_lstm_encoder(input_seq, gate_size): @@ -99,6 +103,8 @@ def bi_lstm_encoder(input_seq, gate_size): size=decoder_size, bias_attr=False, act='tanh') + set_error_clip(args.error_clip_method, encoded_proj.name, + args.error_clip_min, args.error_clip_max) def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, decoder_boot, decoder_size): @@ -211,12 +217,24 @@ def get_model(args): dict_size, False, beam_size=beam_size, - max_length=max_length) + max_length=max_length, + args=args) # clone from default main program inference_program = fluid.default_main_program().clone() - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + # set gradient clip + set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + + optimizer = fluid.optimizer.Adam( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=args.learning_rate, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff)) train_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d264bfc12bdb15..ef1ae433a3b639 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -24,6 +24,10 @@ import paddle import paddle.fluid as fluid import paddle.fluid.profiler as profiler +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip SEED = 1 DTYPE = "float32" @@ -32,7 +36,7 @@ # fluid.default_startup_program().random_seed = SEED -def cnn_model(data): +def cnn_model(data, args): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=data, filter_size=5, @@ -48,6 +52,9 @@ def cnn_model(data): pool_stride=2, act="relu") + set_error_clip(args.error_clip_method, conv_pool_1.name, + 
args.error_clip_min, args.error_clip_max) + # TODO(dzhwinter) : refine the initializer and random seed settting SIZE = 10 input_shape = conv_pool_2.shape @@ -70,7 +77,8 @@ def get_model(args): label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program - predict = cnn_model(images) + predict = cnn_model(images, args) + cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -82,9 +90,21 @@ def get_model(args): # inference program inference_program = fluid.default_main_program().clone() + # set gradient clip + # set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + # Optimization opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=0.001, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff), + beta1=0.9, + beta2=0.999) # Reader train_reader = paddle.batch( diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9dec8911ed64e0..d7e1293fcf2923 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -26,6 +26,10 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): @@ -70,7 +74,7 @@ def layer_warp(block_func, input, ch_out, count, stride): return res_out -def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): +def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'): cfg = { 18: ([2, 2, 2, 1], basicblock), @@ -94,10 +98,12 @@ def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): pool_stride=1, global_pooling=True) out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + set_error_clip(args.error_clip_method, out.name, args.error_clip_min, + args.error_clip_max) return out -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): +def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'): assert (depth - 2) % 6 == 0 n = (depth - 2) // 6 @@ -110,6 +116,8 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): pool = fluid.layers.pool2d( input=res3, pool_size=8, pool_type='avg', pool_stride=1) out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + set_error_clip(args.error_clip_method, out.name, args.error_clip_min, + args.error_clip_max) return out @@ -132,7 +140,7 @@ def get_model(args): input = fluid.layers.data(name='data', shape=dshape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - predict = model(input, class_dim) + predict = model(input, class_dim, args=args) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -145,7 +153,19 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + # set gradient clip + set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) 
+ + optimizer = fluid.optimizer.Momentum( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=0.01, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff), + momentum=0.9) train_reader = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 81a28b5f3aed0c..c84caed175434d 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -28,6 +28,10 @@ import paddle.fluid as fluid import paddle.batch as batch import paddle.fluid.profiler as profiler +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip word_dict = imdb.word_dict() @@ -55,6 +59,9 @@ def get_model(args): sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + set_error_clip(args.error_clip_method, sentence.name, args.error_clip_min, + args.error_clip_max) + rnn = fluid.layers.DynamicRNN() with rnn.block(): word = rnn.step_input(sentence) @@ -110,7 +117,18 @@ def gate_common( inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - adam = fluid.optimizer.Adam() + # set gradient clip + set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + + adam = fluid.optimizer.Adam( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=0.001, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff)) train_reader = batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 53856c5f7acd3a..2d621760b2480a 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -22,9 +22,13 @@ import paddle.fluid.core as core import argparse import functools +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip -def vgg16_bn_drop(input): +def vgg16_bn_drop(input, args): def conv_block(input, num_filter, groups, dropouts): return fluid.nets.img_conv_group( input=input, @@ -48,6 +52,8 @@ def conv_block(input, num_filter, groups, dropouts): bn = fluid.layers.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + set_error_clip(args.error_clip_method, fc1.name, args.error_clip_min, + args.error_clip_max) return fc2 @@ -70,7 +76,7 @@ def get_model(args): label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program - net = vgg16_bn_drop(images) + net = vgg16_bn_drop(images, args=args) predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -86,8 +92,19 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) + # set gradient clip + 
set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + # Optimization - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer = fluid.optimizer.Adam( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=args.learning_rate, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff)) # data reader train_reader = paddle.batch( From 3bd8f9e4f8f1218ab2398707223659062c7f26f7 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 5 Jun 2018 22:05:50 +0800 Subject: [PATCH 02/12] Add some document to README.md under benchmark/fluid/ repo --- benchmark/fluid/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 1b0c7dce8bd6fa..80aa863734b691 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,10 +24,14 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set gradient clipping. With gradient clipping, you can specify + `--gradient_clipping_method GlobalNorm` to clip the gradient with global norm. + You can set regularizer to optimizer. With regularization, you can specify + `--weight_decay_regularizer_method L1` to add regularizer to optimizer. * Run distributed training with parameter servers: * start parameter servers: ```bash From 3bf93b3378f86b3de7fadd6422c6c7e7d7f2a173 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 6 Jun 2018 11:40:37 +0800 Subject: [PATCH 03/12] Add model_base.py --- benchmark/fluid/models/model_base.py | 86 ++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 benchmark/fluid/models/model_base.py diff --git a/benchmark/fluid/models/model_base.py b/benchmark/fluid/models/model_base.py new file mode 100644 index 00000000000000..e2135442e47e26 --- /dev/null +++ b/benchmark/fluid/models/model_base.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse + +import paddle.fluid as fluid +from paddle.fluid.regularizer import L1DecayRegularizer +from paddle.fluid.regularizer import L2DecayRegularizer +from paddle.fluid.clip import GradientClipByNorm +from paddle.fluid.clip import GradientClipByGlobalNorm +from paddle.fluid.clip import ErrorClipByValue + +__all__ = [ + 'get_decay_learning_rate', + 'get_regularization', + 'set_error_clip', + 'set_gradient_clip', +] + + +def get_decay_learning_rate(decay_method, + learning_rate=0.001, + decay_steps=100000, + decay_rate=0.5, + staircase=True): + if not decay_method: + return learning_rate + else: + decay_op = getattr(fluid.layers, "%s_decay" % decay_method) + return decay_op( + learning_rate=learning_rate, + decay_steps=decay_steps, + decay_rate=decay_rate) + + +def get_regularization(regularizer_method, regularizer_coeff=0.1): + if not regularizer_method: + return None + else: + RegularizerClazz = globals()["%sDecayRegularizer" % regularizer_method] + regularizer = RegularizerClazz(regularization_coeff=regularizer_coeff) + return regularizer + + +def set_error_clip(clip_method, + layer_name, + clip_min=-1e-6, + clip_max=2e-6, + program=None): + assert clip_min < clip_max + if not clip_method: + return None + else: + ClipClazz = globals()["ErrorClipBy%s" % clip_method] + if not program: + prog = fluid.default_main_program() + else: + prog = program + prog.block(0).var(layer_name).set_error_clip( + ClipClazz( + max=clip_max, min=clip_min)) + + +def set_gradient_clip(clip_method, clip_norm=1.): + if not clip_method: + return None + else: + ClipClazz = globals()["GradientClipBy%s" % clip_method] + fluid.clip.set_gradient_clip(ClipClazz(clip_norm=clip_norm)) + return clip_method From 8041e8dff6d5c9fb1a7926f6a67eeceea4b5297a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 6 Jun 2018 14:46:40 +0800 Subject: [PATCH 04/12] Fix bugs in test_listen_and_serv_op --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027ddc9c0b..836fcd651d4b11 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -71,16 +71,17 @@ def _start_pserver(self, use_cuda, sync_mode): def _wait_ps_ready(self, pid): retry_times = self.ps_timeout + sleep_time = 0.5 while True: assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + retry_times -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. From 4dd0ded52b529d1de1e69eee65b606cf5fd93e66 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 15:18:01 +0800 Subject: [PATCH 05/12] 1. remove args out of fluid_benchmark.py 2. 
remove lr_decay, regularization, clipping out of fluid_benchmark.py --- benchmark/fluid/args.py | 172 +++++++++++++++++++++++++++++ benchmark/fluid/fluid_benchmark.py | 156 +------------------------- 2 files changed, 173 insertions(+), 155 deletions(-) create mode 100644 benchmark/fluid/args.py diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 00000000000000..7585dc4ac6390e --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + parser.add_argument( + '--learning_rate_decay_method', + type=str, + default=None, + choices=[], + help='Learning rate decay method, not allowed yet') + parser.add_argument( + '--learning_rate_decay_steps', + type=int, + default=100000, + help='Decay steps for learning rate decay method') + parser.add_argument( + '--learning_rate_decay_rate', + type=float, + default=0.5, + help='Decay rate for learning rate decay method') + # args related to regularization + parser.add_argument( + '--weight_decay_regularizer_method', + type=str, + default=None, + choices=[], + help='Weight decay regularizer method, not allowed yet') + parser.add_argument( + '--weight_decay_regularizer_coeff', + type=float, + default=0.1, + help='Weight decay regularizer coeff, 0.1 for default') + # args related to gradient clipping + parser.add_argument( + '--gradient_clip_method', + type=str, + default=None, + choices=[], + help='Gradient clipping method, not allowed yet') + parser.add_argument( + '--gradient_clip_norm', + type=float, + default=1., + help='Gradient clipping norm, 1. for default') + # args related to error clipping + parser.add_argument( + '--error_clip_method', + type=str, + default=None, + choices=[], + help='Error clipping method, not allowed yet') + parser.add_argument( + '--error_clip_min', + type=float, + default=1e-6, + help='Error clipping min value, 1e-6 for default') + parser.add_argument( + '--error_clip_max', + type=float, + default=2e-6, + help='Error clipping max value, 2e-6 for default') + # TODO(wuyi): add "--use_fake_data" option back. 
+ parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 553395d3f8d14d..d3a29bd83c5ebd 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,161 +24,7 @@ import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - # args related to learning rate - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--learning_rate_decay_method', - type=str, - default=None, - choices=['exponential', 'natural_exp', 'inverse_time'], - help='Learning rate decay method, can be exponential, natural_exp, inverse_time' - ) - parser.add_argument( - '--learning_rate_decay_steps', - type=int, - default=100000, - help='Decay steps for 
learning rate decay method') - parser.add_argument( - '--learning_rate_decay_rate', - type=float, - default=0.5, - help='Decay rate for learning rate decay method') - # args related to regularization - parser.add_argument( - '--weight_decay_regularizer_method', - type=str, - default=None, - choices=['L1', 'L2'], - help='Weight decay regularizer method, can be L1, L2') - parser.add_argument( - '--weight_decay_regularizer_coeff', - type=float, - default=0.1, - help='Weight decay regularizer coeff, 0.1 for default') - # args related to gradient clipping - parser.add_argument( - '--gradient_clip_method', - type=str, - default=None, - choices=['Norm', 'GlobalNorm'], - help='Gradient clipping method, can be Norm, GlobalNorm') - parser.add_argument( - '--gradient_clip_norm', - type=float, - default=1., - help='Gradient clipping norm, 1. for default') - # args related to error clipping - parser.add_argument( - '--error_clip_method', - type=str, - default=None, - choices=['Value'], - help='Error clipping method, can be Value') - parser.add_argument( - '--error_clip_min', - type=float, - default=1e-6, - help='Error clipping min value, 1e-6 for default') - parser.add_argument( - '--error_clip_max', - type=float, - default=2e-6, - help='Error clipping max value, 2e-6 for default') - # TODO(wuyi): add "--use_fake_data" option back. - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--no_split_var', - action='store_true', - default=False, - help='Whether split variables into blocks when update_method is pserver') - parser.add_argument( - '--async_mode', - action='store_true', - default=False, - 
help='Whether start pserver in async mode to support ASGD') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): From 9c2e68d9f903b9d2666581414825d6a416d7bc10 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 15:50:55 +0800 Subject: [PATCH 06/12] add async_mode description to doc and remove the clipping description out --- benchmark/fluid/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index cae9351841deda..28cade4634bb62 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -28,10 +28,8 @@ Currently supported `--model` argument include: ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. - You can set gradient clipping. With gradient clipping, you can specify - `--gradient_clipping_method GlobalNorm` to clip the gradient with global norm. - You can set regularizer to optimizer. With regularization, you can specify - `--weight_decay_regularizer_method L1` to add regularizer to optimizer. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: From d11e2bf977ddd319704d9312619677a386763252 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 20:22:13 +0800 Subject: [PATCH 07/12] for restart build --- benchmark/fluid/args.py | 2 +- benchmark/fluid/fluid_benchmark.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 20a25bc2a53386..3549b8fed7678a 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -172,7 +172,7 @@ def parse_args(): parser.add_argument( '--use_reader_op', action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' 
+ help='Whether to use reader op, and must specify the data path if set this to true' ) parser.add_argument( '--data_path', diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 89e67a6bea631f..902dca209fcc07 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -59,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -81,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -316,7 +321,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") From 2da70cc3a85e49850b46b6ea1ba3c33ee46e40b9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 21:33:09 +0800 Subject: [PATCH 08/12] to restart build --- benchmark/fluid/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 3549b8fed7678a..20a25bc2a53386 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -172,7 +172,7 @@ def parse_args(): parser.add_argument( '--use_reader_op', action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true' + help='Whether to use reader op, and must specify the data path if set this to true.' 
) parser.add_argument( '--data_path', From 95cbb4309fe86710d72306d7c02a1f977cfe5dac Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 11:21:23 +0800 Subject: [PATCH 09/12] remove optimization args from args.py --- benchmark/fluid/args.py | 57 ----------------------------------- benchmark/fluid/models/vgg.py | 25 +++------------ 2 files changed, 4 insertions(+), 78 deletions(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 20a25bc2a53386..68a3d42d7a8a80 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -34,63 +34,6 @@ def parse_args(): # args related to learning rate parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--learning_rate_decay_method', - type=str, - default=None, - choices=[], - help='Learning rate decay method, not allowed yet') - parser.add_argument( - '--learning_rate_decay_steps', - type=int, - default=100000, - help='Decay steps for learning rate decay method') - parser.add_argument( - '--learning_rate_decay_rate', - type=float, - default=0.5, - help='Decay rate for learning rate decay method') - # args related to regularization - parser.add_argument( - '--weight_decay_regularizer_method', - type=str, - default=None, - choices=[], - help='Weight decay regularizer method, not allowed yet') - parser.add_argument( - '--weight_decay_regularizer_coeff', - type=float, - default=0.1, - help='Weight decay regularizer coeff, 0.1 for default') - # args related to gradient clipping - parser.add_argument( - '--gradient_clip_method', - type=str, - default=None, - choices=[], - help='Gradient clipping method, not allowed yet') - parser.add_argument( - '--gradient_clip_norm', - type=float, - default=1., - help='Gradient clipping norm, 1. for default') - # args related to error clipping - parser.add_argument( - '--error_clip_method', - type=str, - default=None, - choices=[], - help='Error clipping method, not allowed yet') - parser.add_argument( - '--error_clip_min', - type=float, - default=1e-6, - help='Error clipping min value, 1e-6 for default') - parser.add_argument( - '--error_clip_max', - type=float, - default=2e-6, - help='Error clipping max value, 2e-6 for default') # TODO(wuyi): add "--use_fake_data" option back. 
parser.add_argument( '--skip_batch_num', diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index f2010cb81717b0..6092cdeb884b3a 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -14,7 +14,6 @@ """VGG16 benchmark in Fluid""" from __future__ import print_function -import os import sys import time import numpy as np @@ -23,13 +22,10 @@ import paddle.fluid.core as core import argparse import functools -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip +import os -def vgg16_bn_drop(input, args): +def vgg16_bn_drop(input): def conv_block(input, num_filter, groups, dropouts): return fluid.nets.img_conv_group( input=input, @@ -53,8 +49,6 @@ def conv_block(input, num_filter, groups, dropouts): bn = fluid.layers.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - set_error_clip(args.error_clip_method, fc1.name, args.error_clip_min, - args.error_clip_max) return fc2 @@ -92,7 +86,7 @@ def get_model(args): label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program - net = vgg16_bn_drop(images, args=args) + net = vgg16_bn_drop(images) predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -108,19 +102,8 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - # Optimization - optimizer = fluid.optimizer.Adam( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=args.learning_rate, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff)) + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) # data reader train_reader = paddle.batch( From e140844ec30d1ee364d20354eba4402be851c424 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 11:25:28 +0800 Subject: [PATCH 10/12] 1. remove optimization from models 2. 
fix bug in test_listen_and_serv_op --- benchmark/fluid/models/machine_translation.py | 24 +----- benchmark/fluid/models/mnist.py | 27 +----- benchmark/fluid/models/model_base.py | 86 ------------------- benchmark/fluid/models/resnet.py | 30 ++----- .../fluid/models/stacked_dynamic_lstm.py | 20 +---- 5 files changed, 13 insertions(+), 174 deletions(-) delete mode 100644 benchmark/fluid/models/model_base.py diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 898ffaf6bb475b..69541adf6b7e53 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -26,10 +26,6 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.executor import Executor -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): @@ -54,7 +50,7 @@ def linear(inputs): def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, - target_dict_dim, is_generating, beam_size, max_length, args): + target_dict_dim, is_generating, beam_size, max_length): """Construct a seq2seq network.""" def bi_lstm_encoder(input_seq, gate_size): @@ -103,8 +99,6 @@ def bi_lstm_encoder(input_seq, gate_size): size=decoder_size, bias_attr=False, act='tanh') - set_error_clip(args.error_clip_method, encoded_proj.name, - args.error_clip_min, args.error_clip_max) def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, decoder_boot, decoder_size): @@ -219,24 +213,12 @@ def get_model(args): dict_size, False, beam_size=beam_size, - max_length=max_length, - args=args) + max_length=max_length) # clone from default main program inference_program = fluid.default_main_program().clone() - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - - optimizer = fluid.optimizer.Adam( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=args.learning_rate, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff)) + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) train_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 00a3d76907f664..8e740dc6896b7e 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -25,10 +25,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.profiler as profiler -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip SEED = 1 DTYPE = "float32" @@ -37,7 +33,7 @@ # fluid.default_startup_program().random_seed = SEED -def cnn_model(data, args): +def cnn_model(data): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=data, filter_size=5, @@ -53,9 +49,6 @@ def cnn_model(data, args): pool_stride=2, act="relu") - set_error_clip(args.error_clip_method, conv_pool_1.name, - args.error_clip_min, args.error_clip_max) - # TODO(dzhwinter) : refine the initializer and random seed settting SIZE = 10 input_shape = 
conv_pool_2.shape @@ -96,7 +89,7 @@ def get_model(args): places = fluid.layers.get_places(args.cpus) pd = fluid.layers.ParallelDo(places) with pd.do(): - predict = cnn_model(pd.read_input(images), args) + predict = cnn_model(pd.read_input(images)) label = pd.read_input(label) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -110,7 +103,7 @@ def get_model(args): batch_acc = fluid.layers.mean(batch_acc) else: # Train program - predict = cnn_model(images, args) + predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -120,21 +113,9 @@ def get_model(args): # inference program inference_program = fluid.default_main_program().clone() - # set gradient clip - # set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - # Optimization opt = fluid.optimizer.AdamOptimizer( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=0.001, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff), - beta1=0.9, - beta2=0.999) + learning_rate=0.001, beta1=0.9, beta2=0.999) # Reader train_reader = paddle.batch( diff --git a/benchmark/fluid/models/model_base.py b/benchmark/fluid/models/model_base.py deleted file mode 100644 index e2135442e47e26..00000000000000 --- a/benchmark/fluid/models/model_base.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import argparse - -import paddle.fluid as fluid -from paddle.fluid.regularizer import L1DecayRegularizer -from paddle.fluid.regularizer import L2DecayRegularizer -from paddle.fluid.clip import GradientClipByNorm -from paddle.fluid.clip import GradientClipByGlobalNorm -from paddle.fluid.clip import ErrorClipByValue - -__all__ = [ - 'get_decay_learning_rate', - 'get_regularization', - 'set_error_clip', - 'set_gradient_clip', -] - - -def get_decay_learning_rate(decay_method, - learning_rate=0.001, - decay_steps=100000, - decay_rate=0.5, - staircase=True): - if not decay_method: - return learning_rate - else: - decay_op = getattr(fluid.layers, "%s_decay" % decay_method) - return decay_op( - learning_rate=learning_rate, - decay_steps=decay_steps, - decay_rate=decay_rate) - - -def get_regularization(regularizer_method, regularizer_coeff=0.1): - if not regularizer_method: - return None - else: - RegularizerClazz = globals()["%sDecayRegularizer" % regularizer_method] - regularizer = RegularizerClazz(regularization_coeff=regularizer_coeff) - return regularizer - - -def set_error_clip(clip_method, - layer_name, - clip_min=-1e-6, - clip_max=2e-6, - program=None): - assert clip_min < clip_max - if not clip_method: - return None - else: - ClipClazz = globals()["ErrorClipBy%s" % clip_method] - if not program: - prog = fluid.default_main_program() - else: - prog = program - prog.block(0).var(layer_name).set_error_clip( - ClipClazz( - max=clip_max, min=clip_min)) - - -def set_gradient_clip(clip_method, clip_norm=1.): - if not clip_method: - return None - else: - ClipClazz = globals()["GradientClipBy%s" % clip_method] - fluid.clip.set_gradient_clip(ClipClazz(clip_norm=clip_norm)) - return clip_method diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index beecfceb996825..2ee2b5be09bfcc 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -27,10 +27,6 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip from recordio_converter import imagenet_train, imagenet_test @@ -76,7 +72,7 @@ def layer_warp(block_func, input, ch_out, count, stride): return res_out -def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'): +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): cfg = { 18: ([2, 2, 2, 1], basicblock), @@ -100,12 +96,10 @@ def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'): pool_stride=1, global_pooling=True) out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - set_error_clip(args.error_clip_method, out.name, args.error_clip_min, - args.error_clip_max) return out -def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'): +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): assert (depth - 2) % 6 == 0 n = (depth - 2) // 6 @@ -118,8 +112,6 @@ def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'): pool = fluid.layers.pool2d( input=res3, pool_size=8, pool_type='avg', pool_stride=1) out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - set_error_clip(args.error_clip_method, out.name, args.error_clip_min, - args.error_clip_max) 
return out @@ -179,7 +171,7 @@ def get_model(args): places = fluid.layers.get_places(args.cpus) pd = fluid.layers.ParallelDo(places) with pd.do(): - predict = model(pd.read_input(input), class_dim, args=args) + predict = model(pd.read_input(input), class_dim) label = pd.read_input(label) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -192,7 +184,7 @@ def get_model(args): avg_cost = fluid.layers.mean(avg_cost) batch_acc = fluid.layers.mean(batch_acc) else: - predict = model(input, class_dim, args=args) + predict = model(input, class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) batch_acc = fluid.layers.accuracy(input=predict, label=label) @@ -202,19 +194,7 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc]) - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - - optimizer = fluid.optimizer.Momentum( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=0.01, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff), - momentum=0.9) + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) batched_train_reader = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 66e69699f073b9..e1c4857f1a365f 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -28,10 +28,6 @@ import paddle.fluid as fluid import paddle.batch as batch import paddle.fluid.profiler as profiler -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip word_dict = imdb.word_dict() @@ -62,9 +58,6 @@ def get_model(args): sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') - set_error_clip(args.error_clip_method, sentence.name, args.error_clip_min, - args.error_clip_max) - rnn = fluid.layers.DynamicRNN() with rnn.block(): word = rnn.step_input(sentence) @@ -119,18 +112,7 @@ def gate_common( inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - - adam = fluid.optimizer.Adam( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=0.001, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff)) + adam = fluid.optimizer.Adam() train_reader = batch( paddle.reader.shuffle( From 0a90eee3f8f8cbadbc723fe6373842f2e58fff81 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 15:16:55 +0800 Subject: [PATCH 11/12] change the name retry_times to left_time --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py 
b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 836fcd651d4b11..1422b3bae49680 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,10 +70,10 @@ def _start_pserver(self, use_cuda, sync_mode): return p.pid def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + left_time = self.ps_timeout sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" + assert left_time >= 0, "wait ps ready failed" time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port @@ -81,7 +81,7 @@ def _wait_ps_ready(self, pid): os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= sleep_time + left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. From c950d22fcc2e2aa2ecfd4b3cfd08ee0bb0e58d23 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 15:18:26 +0800 Subject: [PATCH 12/12] change retry_times to the pserver start left time --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1422b3bae49680..d1d709551c7790 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,10 +70,10 @@ def _start_pserver(self, use_cuda, sync_mode): return p.pid def _wait_ps_ready(self, pid): - left_time = self.ps_timeout + start_left_time = self.ps_timeout sleep_time = 0.5 while True: - assert left_time >= 0, "wait ps ready failed" + assert start_left_time >= 0, "wait ps ready failed" time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port @@ -81,7 +81,7 @@ def _wait_ps_ready(self, pid): os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - left_time -= sleep_time + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly.
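Taken as a whole, the series ends up removing the learning-rate-decay, regularization, and clipping plumbing again and leaves `fluid_benchmark.py` with only the plain `--learning_rate` option plus the new distributed-training switches (`--update_method`, `--async_mode`, `--no_split_var`). A rough sketch of how those surviving flags combine for a parameter-server run follows; `PADDLE_TRAINING_ROLE` is read by `dist_transpile` as shown in patch 07, while the other endpoint/trainer-count environment variables it relies on are assumed to be exported separately and are not spelled out in this series:

```bash
# Hypothetical trainer-side invocation after patch 12; the pserver-related
# environment (PADDLE_TRAINING_ROLE plus the endpoint and trainer-count
# variables read by dist_transpile) is assumed to be set up beforehand.
python fluid_benchmark.py --model mnist --device GPU \
    --update_method pserver --async_mode --no_split_var
```

Note that `--async_mode` maps to `sync_mode=not args.async_mode` and `--no_split_var` maps to `slice_var_up=not args.no_split_var` in the transpiler call, so omitting both flags preserves the previous synchronous, variable-splitting behaviour.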