From 72bb214c8a8afd71b98ef03fd3fd10cfe5fb3a86 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 5 Jun 2018 21:51:17 +0800 Subject: [PATCH 01/12] 1. add weight decay feature into fluid benchmark test 2. add learning rate decay feature into fluid benchmark test 3. add L1&L2 regularization feature into fluid benchmark test 4. add error clipping feature into fluid benchmark test 5. add gradient clipping feature into fluid benchmark test --- benchmark/fluid/fluid_benchmark.py | 69 +++++++++++++++++++ benchmark/fluid/models/machine_translation.py | 24 ++++++- benchmark/fluid/models/mnist.py | 26 ++++++- benchmark/fluid/models/resnet.py | 28 ++++++-- .../fluid/models/stacked_dynamic_lstm.py | 20 +++++- benchmark/fluid/models/vgg.py | 23 ++++++- 6 files changed, 176 insertions(+), 14 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 9d33a841cddb8d..67d1591595a068 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -39,8 +39,67 @@ def parse_args(): help='The model to run benchmark with.') parser.add_argument( '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') + parser.add_argument( + '--learning_rate_decay_method', + type=str, + default=None, + choices=['exponential', 'natural_exp', 'inverse_time'], + help='Learning rate decay method, can be exponential, natural_exp, inverse_time' + ) + parser.add_argument( + '--learning_rate_decay_steps', + type=int, + default=100000, + help='Decay steps for learning rate decay method') + parser.add_argument( + '--learning_rate_decay_rate', + type=float, + default=0.5, + help='Decay rate for learning rate decay method') + # args related to regularization + parser.add_argument( + '--weight_decay_regularizer_method', + type=str, + default=None, + choices=['L1', 'L2'], + help='Weight decay regularizer method, can be L1, L2') + parser.add_argument( + '--weight_decay_regularizer_coeff', + type=float, + default=0.1, + help='Weight decay regularizer coeff, 0.1 for default') + # args related to gradient clipping + parser.add_argument( + '--gradient_clip_method', + type=str, + default=None, + choices=['Norm', 'GlobalNorm'], + help='Gradient clipping method, can be Norm, GlobalNorm') + parser.add_argument( + '--gradient_clip_norm', + type=float, + default=1., + help='Gradient clipping norm, 1. for default') + # args related to error clipping + parser.add_argument( + '--error_clip_method', + type=str, + default=None, + choices=['Value'], + help='Error clipping method, can be Value') + parser.add_argument( + '--error_clip_min', + type=float, + default=1e-6, + help='Error clipping min value, 1e-6 for default') + parser.add_argument( + '--error_clip_max', + type=float, + default=2e-6, + help='Error clipping max value, 2e-6 for default') # TODO(wuyi): add "--use_fake_data" option back. 
parser.add_argument( '--skip_batch_num', @@ -103,6 +162,16 @@ def parse_args(): default='local', choices=['local', 'pserver', 'nccl2'], help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') args = parser.parse_args() return args diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd27b2..637b291f8a4f6e 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -26,6 +26,10 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.executor import Executor +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): @@ -50,7 +54,7 @@ def linear(inputs): def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, - target_dict_dim, is_generating, beam_size, max_length): + target_dict_dim, is_generating, beam_size, max_length, args): """Construct a seq2seq network.""" def bi_lstm_encoder(input_seq, gate_size): @@ -99,6 +103,8 @@ def bi_lstm_encoder(input_seq, gate_size): size=decoder_size, bias_attr=False, act='tanh') + set_error_clip(args.error_clip_method, encoded_proj.name, + args.error_clip_min, args.error_clip_max) def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, decoder_boot, decoder_size): @@ -211,12 +217,24 @@ def get_model(args): dict_size, False, beam_size=beam_size, - max_length=max_length) + max_length=max_length, + args=args) # clone from default main program inference_program = fluid.default_main_program().clone() - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + # set gradient clip + set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + + optimizer = fluid.optimizer.Adam( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=args.learning_rate, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff)) train_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d264bfc12bdb15..ef1ae433a3b639 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -24,6 +24,10 @@ import paddle import paddle.fluid as fluid import paddle.fluid.profiler as profiler +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip SEED = 1 DTYPE = "float32" @@ -32,7 +36,7 @@ # fluid.default_startup_program().random_seed = SEED -def cnn_model(data): +def cnn_model(data, args): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=data, filter_size=5, @@ -48,6 +52,9 @@ def cnn_model(data): pool_stride=2, act="relu") + set_error_clip(args.error_clip_method, conv_pool_1.name, + 
args.error_clip_min, args.error_clip_max) + # TODO(dzhwinter) : refine the initializer and random seed settting SIZE = 10 input_shape = conv_pool_2.shape @@ -70,7 +77,8 @@ def get_model(args): label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program - predict = cnn_model(images) + predict = cnn_model(images, args) + cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -82,9 +90,21 @@ def get_model(args): # inference program inference_program = fluid.default_main_program().clone() + # set gradient clip + # set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + # Optimization opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=0.001, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff), + beta1=0.9, + beta2=0.999) # Reader train_reader = paddle.batch( diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9dec8911ed64e0..d7e1293fcf2923 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -26,6 +26,10 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): @@ -70,7 +74,7 @@ def layer_warp(block_func, input, ch_out, count, stride): return res_out -def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): +def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'): cfg = { 18: ([2, 2, 2, 1], basicblock), @@ -94,10 +98,12 @@ def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): pool_stride=1, global_pooling=True) out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + set_error_clip(args.error_clip_method, out.name, args.error_clip_min, + args.error_clip_max) return out -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): +def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'): assert (depth - 2) % 6 == 0 n = (depth - 2) // 6 @@ -110,6 +116,8 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): pool = fluid.layers.pool2d( input=res3, pool_size=8, pool_type='avg', pool_stride=1) out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + set_error_clip(args.error_clip_method, out.name, args.error_clip_min, + args.error_clip_max) return out @@ -132,7 +140,7 @@ def get_model(args): input = fluid.layers.data(name='data', shape=dshape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - predict = model(input, class_dim) + predict = model(input, class_dim, args=args) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -145,7 +153,19 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + # set gradient clip + set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) 
+ + optimizer = fluid.optimizer.Momentum( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=0.01, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff), + momentum=0.9) train_reader = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 81a28b5f3aed0c..c84caed175434d 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -28,6 +28,10 @@ import paddle.fluid as fluid import paddle.batch as batch import paddle.fluid.profiler as profiler +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip word_dict = imdb.word_dict() @@ -55,6 +59,9 @@ def get_model(args): sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + set_error_clip(args.error_clip_method, sentence.name, args.error_clip_min, + args.error_clip_max) + rnn = fluid.layers.DynamicRNN() with rnn.block(): word = rnn.step_input(sentence) @@ -110,7 +117,18 @@ def gate_common( inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - adam = fluid.optimizer.Adam() + # set gradient clip + set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + + adam = fluid.optimizer.Adam( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=0.001, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff)) train_reader = batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 53856c5f7acd3a..2d621760b2480a 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -22,9 +22,13 @@ import paddle.fluid.core as core import argparse import functools +from models.model_base import get_decay_learning_rate +from models.model_base import get_regularization +from models.model_base import set_error_clip +from models.model_base import set_gradient_clip -def vgg16_bn_drop(input): +def vgg16_bn_drop(input, args): def conv_block(input, num_filter, groups, dropouts): return fluid.nets.img_conv_group( input=input, @@ -48,6 +52,8 @@ def conv_block(input, num_filter, groups, dropouts): bn = fluid.layers.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + set_error_clip(args.error_clip_method, fc1.name, args.error_clip_min, + args.error_clip_max) return fc2 @@ -70,7 +76,7 @@ def get_model(args): label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program - net = vgg16_bn_drop(images) + net = vgg16_bn_drop(images, args=args) predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -86,8 +92,19 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) + # set gradient clip + 
set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) + # Optimization - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer = fluid.optimizer.Adam( + learning_rate=get_decay_learning_rate( + decay_method=args.learning_rate_decay_method, + learning_rate=args.learning_rate, + decay_steps=args.learning_rate_decay_steps, + decay_rate=args.learning_rate_decay_rate), + regularization=get_regularization( + regularizer_method=args.weight_decay_regularizer_method, + regularizer_coeff=args.weight_decay_regularizer_coeff)) # data reader train_reader = paddle.batch( From 3bd8f9e4f8f1218ab2398707223659062c7f26f7 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 5 Jun 2018 22:05:50 +0800 Subject: [PATCH 02/12] Add some document to README.md under benchmark/fluid/ repo --- benchmark/fluid/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 1b0c7dce8bd6fa..80aa863734b691 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,10 +24,14 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set gradient clipping. With gradient clipping, you can specify + `--gradient_clipping_method GlobalNorm` to clip the gradient with global norm. + You can set regularizer to optimizer. With regularization, you can specify + `--weight_decay_regularizer_method L1` to add regularizer to optimizer. * Run distributed training with parameter servers: * start parameter servers: ```bash From 3bf93b3378f86b3de7fadd6422c6c7e7d7f2a173 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 6 Jun 2018 11:40:37 +0800 Subject: [PATCH 03/12] Add model_base.py --- benchmark/fluid/models/model_base.py | 86 ++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 benchmark/fluid/models/model_base.py diff --git a/benchmark/fluid/models/model_base.py b/benchmark/fluid/models/model_base.py new file mode 100644 index 00000000000000..e2135442e47e26 --- /dev/null +++ b/benchmark/fluid/models/model_base.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse + +import paddle.fluid as fluid +from paddle.fluid.regularizer import L1DecayRegularizer +from paddle.fluid.regularizer import L2DecayRegularizer +from paddle.fluid.clip import GradientClipByNorm +from paddle.fluid.clip import GradientClipByGlobalNorm +from paddle.fluid.clip import ErrorClipByValue + +__all__ = [ + 'get_decay_learning_rate', + 'get_regularization', + 'set_error_clip', + 'set_gradient_clip', +] + + +def get_decay_learning_rate(decay_method, + learning_rate=0.001, + decay_steps=100000, + decay_rate=0.5, + staircase=True): + if not decay_method: + return learning_rate + else: + decay_op = getattr(fluid.layers, "%s_decay" % decay_method) + return decay_op( + learning_rate=learning_rate, + decay_steps=decay_steps, + decay_rate=decay_rate) + + +def get_regularization(regularizer_method, regularizer_coeff=0.1): + if not regularizer_method: + return None + else: + RegularizerClazz = globals()["%sDecayRegularizer" % regularizer_method] + regularizer = RegularizerClazz(regularization_coeff=regularizer_coeff) + return regularizer + + +def set_error_clip(clip_method, + layer_name, + clip_min=-1e-6, + clip_max=2e-6, + program=None): + assert clip_min < clip_max + if not clip_method: + return None + else: + ClipClazz = globals()["ErrorClipBy%s" % clip_method] + if not program: + prog = fluid.default_main_program() + else: + prog = program + prog.block(0).var(layer_name).set_error_clip( + ClipClazz( + max=clip_max, min=clip_min)) + + +def set_gradient_clip(clip_method, clip_norm=1.): + if not clip_method: + return None + else: + ClipClazz = globals()["GradientClipBy%s" % clip_method] + fluid.clip.set_gradient_clip(ClipClazz(clip_norm=clip_norm)) + return clip_method From 8041e8dff6d5c9fb1a7926f6a67eeceea4b5297a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 6 Jun 2018 14:46:40 +0800 Subject: [PATCH 04/12] Fix bugs in test_listen_and_serv_op --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027ddc9c0b..836fcd651d4b11 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -71,16 +71,17 @@ def _start_pserver(self, use_cuda, sync_mode): def _wait_ps_ready(self, pid): retry_times = self.ps_timeout + sleep_time = 0.5 while True: assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + retry_times -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. From 4dd0ded52b529d1de1e69eee65b606cf5fd93e66 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 15:18:01 +0800 Subject: [PATCH 05/12] 1. remove args out of fluid_benchmark.py 2. 
remove lr_decay, regularization, clipping out of fluid_benchmark.py --- benchmark/fluid/args.py | 172 +++++++++++++++++++++++++++++ benchmark/fluid/fluid_benchmark.py | 156 +------------------------- 2 files changed, 173 insertions(+), 155 deletions(-) create mode 100644 benchmark/fluid/args.py diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 00000000000000..7585dc4ac6390e --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + parser.add_argument( + '--learning_rate_decay_method', + type=str, + default=None, + choices=[], + help='Learning rate decay method, not allowed yet') + parser.add_argument( + '--learning_rate_decay_steps', + type=int, + default=100000, + help='Decay steps for learning rate decay method') + parser.add_argument( + '--learning_rate_decay_rate', + type=float, + default=0.5, + help='Decay rate for learning rate decay method') + # args related to regularization + parser.add_argument( + '--weight_decay_regularizer_method', + type=str, + default=None, + choices=[], + help='Weight decay regularizer method, not allowed yet') + parser.add_argument( + '--weight_decay_regularizer_coeff', + type=float, + default=0.1, + help='Weight decay regularizer coeff, 0.1 for default') + # args related to gradient clipping + parser.add_argument( + '--gradient_clip_method', + type=str, + default=None, + choices=[], + help='Gradient clipping method, not allowed yet') + parser.add_argument( + '--gradient_clip_norm', + type=float, + default=1., + help='Gradient clipping norm, 1. for default') + # args related to error clipping + parser.add_argument( + '--error_clip_method', + type=str, + default=None, + choices=[], + help='Error clipping method, not allowed yet') + parser.add_argument( + '--error_clip_min', + type=float, + default=1e-6, + help='Error clipping min value, 1e-6 for default') + parser.add_argument( + '--error_clip_max', + type=float, + default=2e-6, + help='Error clipping max value, 2e-6 for default') + # TODO(wuyi): add "--use_fake_data" option back. 
+ parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 553395d3f8d14d..d3a29bd83c5ebd 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,161 +24,7 @@ import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - # args related to learning rate - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--learning_rate_decay_method', - type=str, - default=None, - choices=['exponential', 'natural_exp', 'inverse_time'], - help='Learning rate decay method, can be exponential, natural_exp, inverse_time' - ) - parser.add_argument( - '--learning_rate_decay_steps', - type=int, - default=100000, - help='Decay steps for 
learning rate decay method') - parser.add_argument( - '--learning_rate_decay_rate', - type=float, - default=0.5, - help='Decay rate for learning rate decay method') - # args related to regularization - parser.add_argument( - '--weight_decay_regularizer_method', - type=str, - default=None, - choices=['L1', 'L2'], - help='Weight decay regularizer method, can be L1, L2') - parser.add_argument( - '--weight_decay_regularizer_coeff', - type=float, - default=0.1, - help='Weight decay regularizer coeff, 0.1 for default') - # args related to gradient clipping - parser.add_argument( - '--gradient_clip_method', - type=str, - default=None, - choices=['Norm', 'GlobalNorm'], - help='Gradient clipping method, can be Norm, GlobalNorm') - parser.add_argument( - '--gradient_clip_norm', - type=float, - default=1., - help='Gradient clipping norm, 1. for default') - # args related to error clipping - parser.add_argument( - '--error_clip_method', - type=str, - default=None, - choices=['Value'], - help='Error clipping method, can be Value') - parser.add_argument( - '--error_clip_min', - type=float, - default=1e-6, - help='Error clipping min value, 1e-6 for default') - parser.add_argument( - '--error_clip_max', - type=float, - default=2e-6, - help='Error clipping max value, 2e-6 for default') - # TODO(wuyi): add "--use_fake_data" option back. - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--no_split_var', - action='store_true', - default=False, - help='Whether split variables into blocks when update_method is pserver') - parser.add_argument( - '--async_mode', - action='store_true', - default=False, - 
help='Whether start pserver in async mode to support ASGD') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): From 9c2e68d9f903b9d2666581414825d6a416d7bc10 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 15:50:55 +0800 Subject: [PATCH 06/12] add async_mode description to doc and remove the clipping description out --- benchmark/fluid/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index cae9351841deda..28cade4634bb62 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -28,10 +28,8 @@ Currently supported `--model` argument include: ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. - You can set gradient clipping. With gradient clipping, you can specify - `--gradient_clipping_method GlobalNorm` to clip the gradient with global norm. - You can set regularizer to optimizer. With regularization, you can specify - `--weight_decay_regularizer_method L1` to add regularizer to optimizer. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: From d11e2bf977ddd319704d9312619677a386763252 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 20:22:13 +0800 Subject: [PATCH 07/12] for restart build --- benchmark/fluid/args.py | 2 +- benchmark/fluid/fluid_benchmark.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 20a25bc2a53386..3549b8fed7678a 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -172,7 +172,7 @@ def parse_args(): parser.add_argument( '--use_reader_op', action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' 
+ help='Whether to use reader op, and must specify the data path if set this to true' ) parser.add_argument( '--data_path', diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 89e67a6bea631f..902dca209fcc07 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -59,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -81,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -316,7 +321,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") From 2da70cc3a85e49850b46b6ea1ba3c33ee46e40b9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 8 Jun 2018 21:33:09 +0800 Subject: [PATCH 08/12] to restart build --- benchmark/fluid/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 3549b8fed7678a..20a25bc2a53386 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -172,7 +172,7 @@ def parse_args(): parser.add_argument( '--use_reader_op', action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true' + help='Whether to use reader op, and must specify the data path if set this to true.' 
) parser.add_argument( '--data_path', From 95cbb4309fe86710d72306d7c02a1f977cfe5dac Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 11:21:23 +0800 Subject: [PATCH 09/12] remove optimization args from args.py --- benchmark/fluid/args.py | 57 ----------------------------------- benchmark/fluid/models/vgg.py | 25 +++------------ 2 files changed, 4 insertions(+), 78 deletions(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 20a25bc2a53386..68a3d42d7a8a80 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -34,63 +34,6 @@ def parse_args(): # args related to learning rate parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--learning_rate_decay_method', - type=str, - default=None, - choices=[], - help='Learning rate decay method, not allowed yet') - parser.add_argument( - '--learning_rate_decay_steps', - type=int, - default=100000, - help='Decay steps for learning rate decay method') - parser.add_argument( - '--learning_rate_decay_rate', - type=float, - default=0.5, - help='Decay rate for learning rate decay method') - # args related to regularization - parser.add_argument( - '--weight_decay_regularizer_method', - type=str, - default=None, - choices=[], - help='Weight decay regularizer method, not allowed yet') - parser.add_argument( - '--weight_decay_regularizer_coeff', - type=float, - default=0.1, - help='Weight decay regularizer coeff, 0.1 for default') - # args related to gradient clipping - parser.add_argument( - '--gradient_clip_method', - type=str, - default=None, - choices=[], - help='Gradient clipping method, not allowed yet') - parser.add_argument( - '--gradient_clip_norm', - type=float, - default=1., - help='Gradient clipping norm, 1. for default') - # args related to error clipping - parser.add_argument( - '--error_clip_method', - type=str, - default=None, - choices=[], - help='Error clipping method, not allowed yet') - parser.add_argument( - '--error_clip_min', - type=float, - default=1e-6, - help='Error clipping min value, 1e-6 for default') - parser.add_argument( - '--error_clip_max', - type=float, - default=2e-6, - help='Error clipping max value, 2e-6 for default') # TODO(wuyi): add "--use_fake_data" option back. 
parser.add_argument( '--skip_batch_num', diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index f2010cb81717b0..6092cdeb884b3a 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -14,7 +14,6 @@ """VGG16 benchmark in Fluid""" from __future__ import print_function -import os import sys import time import numpy as np @@ -23,13 +22,10 @@ import paddle.fluid.core as core import argparse import functools -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip +import os -def vgg16_bn_drop(input, args): +def vgg16_bn_drop(input): def conv_block(input, num_filter, groups, dropouts): return fluid.nets.img_conv_group( input=input, @@ -53,8 +49,6 @@ def conv_block(input, num_filter, groups, dropouts): bn = fluid.layers.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - set_error_clip(args.error_clip_method, fc1.name, args.error_clip_min, - args.error_clip_max) return fc2 @@ -92,7 +86,7 @@ def get_model(args): label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program - net = vgg16_bn_drop(images, args=args) + net = vgg16_bn_drop(images) predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -108,19 +102,8 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - # Optimization - optimizer = fluid.optimizer.Adam( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=args.learning_rate, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff)) + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) # data reader train_reader = paddle.batch( From e140844ec30d1ee364d20354eba4402be851c424 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 11:25:28 +0800 Subject: [PATCH 10/12] 1. remove optimization from models 2. 
fix bug in test_listen_and_serv_op --- benchmark/fluid/models/machine_translation.py | 24 +----- benchmark/fluid/models/mnist.py | 27 +----- benchmark/fluid/models/model_base.py | 86 ------------------- benchmark/fluid/models/resnet.py | 30 ++----- .../fluid/models/stacked_dynamic_lstm.py | 20 +---- 5 files changed, 13 insertions(+), 174 deletions(-) delete mode 100644 benchmark/fluid/models/model_base.py diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 898ffaf6bb475b..69541adf6b7e53 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -26,10 +26,6 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.executor import Executor -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): @@ -54,7 +50,7 @@ def linear(inputs): def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, - target_dict_dim, is_generating, beam_size, max_length, args): + target_dict_dim, is_generating, beam_size, max_length): """Construct a seq2seq network.""" def bi_lstm_encoder(input_seq, gate_size): @@ -103,8 +99,6 @@ def bi_lstm_encoder(input_seq, gate_size): size=decoder_size, bias_attr=False, act='tanh') - set_error_clip(args.error_clip_method, encoded_proj.name, - args.error_clip_min, args.error_clip_max) def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, decoder_boot, decoder_size): @@ -219,24 +213,12 @@ def get_model(args): dict_size, False, beam_size=beam_size, - max_length=max_length, - args=args) + max_length=max_length) # clone from default main program inference_program = fluid.default_main_program().clone() - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - - optimizer = fluid.optimizer.Adam( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=args.learning_rate, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff)) + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) train_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 00a3d76907f664..8e740dc6896b7e 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -25,10 +25,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.profiler as profiler -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip SEED = 1 DTYPE = "float32" @@ -37,7 +33,7 @@ # fluid.default_startup_program().random_seed = SEED -def cnn_model(data, args): +def cnn_model(data): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=data, filter_size=5, @@ -53,9 +49,6 @@ def cnn_model(data, args): pool_stride=2, act="relu") - set_error_clip(args.error_clip_method, conv_pool_1.name, - args.error_clip_min, args.error_clip_max) - # TODO(dzhwinter) : refine the initializer and random seed settting SIZE = 10 input_shape = 
conv_pool_2.shape @@ -96,7 +89,7 @@ def get_model(args): places = fluid.layers.get_places(args.cpus) pd = fluid.layers.ParallelDo(places) with pd.do(): - predict = cnn_model(pd.read_input(images), args) + predict = cnn_model(pd.read_input(images)) label = pd.read_input(label) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -110,7 +103,7 @@ def get_model(args): batch_acc = fluid.layers.mean(batch_acc) else: # Train program - predict = cnn_model(images, args) + predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -120,21 +113,9 @@ def get_model(args): # inference program inference_program = fluid.default_main_program().clone() - # set gradient clip - # set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - # Optimization opt = fluid.optimizer.AdamOptimizer( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=0.001, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff), - beta1=0.9, - beta2=0.999) + learning_rate=0.001, beta1=0.9, beta2=0.999) # Reader train_reader = paddle.batch( diff --git a/benchmark/fluid/models/model_base.py b/benchmark/fluid/models/model_base.py deleted file mode 100644 index e2135442e47e26..00000000000000 --- a/benchmark/fluid/models/model_base.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import argparse - -import paddle.fluid as fluid -from paddle.fluid.regularizer import L1DecayRegularizer -from paddle.fluid.regularizer import L2DecayRegularizer -from paddle.fluid.clip import GradientClipByNorm -from paddle.fluid.clip import GradientClipByGlobalNorm -from paddle.fluid.clip import ErrorClipByValue - -__all__ = [ - 'get_decay_learning_rate', - 'get_regularization', - 'set_error_clip', - 'set_gradient_clip', -] - - -def get_decay_learning_rate(decay_method, - learning_rate=0.001, - decay_steps=100000, - decay_rate=0.5, - staircase=True): - if not decay_method: - return learning_rate - else: - decay_op = getattr(fluid.layers, "%s_decay" % decay_method) - return decay_op( - learning_rate=learning_rate, - decay_steps=decay_steps, - decay_rate=decay_rate) - - -def get_regularization(regularizer_method, regularizer_coeff=0.1): - if not regularizer_method: - return None - else: - RegularizerClazz = globals()["%sDecayRegularizer" % regularizer_method] - regularizer = RegularizerClazz(regularization_coeff=regularizer_coeff) - return regularizer - - -def set_error_clip(clip_method, - layer_name, - clip_min=-1e-6, - clip_max=2e-6, - program=None): - assert clip_min < clip_max - if not clip_method: - return None - else: - ClipClazz = globals()["ErrorClipBy%s" % clip_method] - if not program: - prog = fluid.default_main_program() - else: - prog = program - prog.block(0).var(layer_name).set_error_clip( - ClipClazz( - max=clip_max, min=clip_min)) - - -def set_gradient_clip(clip_method, clip_norm=1.): - if not clip_method: - return None - else: - ClipClazz = globals()["GradientClipBy%s" % clip_method] - fluid.clip.set_gradient_clip(ClipClazz(clip_norm=clip_norm)) - return clip_method diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index beecfceb996825..2ee2b5be09bfcc 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -27,10 +27,6 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip from recordio_converter import imagenet_train, imagenet_test @@ -76,7 +72,7 @@ def layer_warp(block_func, input, ch_out, count, stride): return res_out -def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'): +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): cfg = { 18: ([2, 2, 2, 1], basicblock), @@ -100,12 +96,10 @@ def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'): pool_stride=1, global_pooling=True) out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - set_error_clip(args.error_clip_method, out.name, args.error_clip_min, - args.error_clip_max) return out -def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'): +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): assert (depth - 2) % 6 == 0 n = (depth - 2) // 6 @@ -118,8 +112,6 @@ def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'): pool = fluid.layers.pool2d( input=res3, pool_size=8, pool_type='avg', pool_stride=1) out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - set_error_clip(args.error_clip_method, out.name, args.error_clip_min, - args.error_clip_max) 
return out @@ -179,7 +171,7 @@ def get_model(args): places = fluid.layers.get_places(args.cpus) pd = fluid.layers.ParallelDo(places) with pd.do(): - predict = model(pd.read_input(input), class_dim, args=args) + predict = model(pd.read_input(input), class_dim) label = pd.read_input(label) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -192,7 +184,7 @@ def get_model(args): avg_cost = fluid.layers.mean(avg_cost) batch_acc = fluid.layers.mean(batch_acc) else: - predict = model(input, class_dim, args=args) + predict = model(input, class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) batch_acc = fluid.layers.accuracy(input=predict, label=label) @@ -202,19 +194,7 @@ def get_model(args): inference_program = fluid.io.get_inference_program( target_vars=[batch_acc]) - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - - optimizer = fluid.optimizer.Momentum( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=0.01, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff), - momentum=0.9) + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) batched_train_reader = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 66e69699f073b9..e1c4857f1a365f 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -28,10 +28,6 @@ import paddle.fluid as fluid import paddle.batch as batch import paddle.fluid.profiler as profiler -from models.model_base import get_decay_learning_rate -from models.model_base import get_regularization -from models.model_base import set_error_clip -from models.model_base import set_gradient_clip word_dict = imdb.word_dict() @@ -62,9 +58,6 @@ def get_model(args): sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') - set_error_clip(args.error_clip_method, sentence.name, args.error_clip_min, - args.error_clip_max) - rnn = fluid.layers.DynamicRNN() with rnn.block(): word = rnn.step_input(sentence) @@ -119,18 +112,7 @@ def gate_common( inference_program = fluid.io.get_inference_program( target_vars=[batch_acc, batch_size_tensor]) - # set gradient clip - set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm) - - adam = fluid.optimizer.Adam( - learning_rate=get_decay_learning_rate( - decay_method=args.learning_rate_decay_method, - learning_rate=0.001, - decay_steps=args.learning_rate_decay_steps, - decay_rate=args.learning_rate_decay_rate), - regularization=get_regularization( - regularizer_method=args.weight_decay_regularizer_method, - regularizer_coeff=args.weight_decay_regularizer_coeff)) + adam = fluid.optimizer.Adam() train_reader = batch( paddle.reader.shuffle( From 0a90eee3f8f8cbadbc723fe6373842f2e58fff81 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 15:16:55 +0800 Subject: [PATCH 11/12] change the name retry_times to left_time --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py 
b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 836fcd651d4b11..1422b3bae49680 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,10 +70,10 @@ def _start_pserver(self, use_cuda, sync_mode): return p.pid def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + left_time = self.ps_timeout sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" + assert left_time >= 0, "wait ps ready failed" time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port @@ -81,7 +81,7 @@ def _wait_ps_ready(self, pid): os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= sleep_time + left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. From c950d22fcc2e2aa2ecfd4b3cfd08ee0bb0e58d23 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Jun 2018 15:18:26 +0800 Subject: [PATCH 12/12] change retry_times to the pserver start left time --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1422b3bae49680..d1d709551c7790 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,10 +70,10 @@ def _start_pserver(self, use_cuda, sync_mode): return p.pid def _wait_ps_ready(self, pid): - left_time = self.ps_timeout + start_left_time = self.ps_timeout sleep_time = 0.5 while True: - assert left_time >= 0, "wait ps ready failed" + assert start_left_time >= 0, "wait ps ready failed" time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port @@ -81,7 +81,7 @@ def _wait_ps_ready(self, pid): os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - left_time -= sleep_time + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly.
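Taken as a whole, the series ends up removing the learning-rate-decay, regularization, and clipping plumbing again and leaves `fluid_benchmark.py` with only the plain `--learning_rate` option plus the new distributed-training switches (`--update_method`, `--async_mode`, `--no_split_var`). A rough sketch of how those surviving flags combine for a parameter-server run follows; `PADDLE_TRAINING_ROLE` is read by `dist_transpile` as shown in patch 07, while the other endpoint/trainer-count environment variables it relies on are assumed to be exported separately and are not spelled out in this series:

```bash
# Hypothetical trainer-side invocation after patch 12; the pserver-related
# environment (PADDLE_TRAINING_ROLE plus the endpoint and trainer-count
# variables read by dist_transpile) is assumed to be set up beforehand.
python fluid_benchmark.py --model mnist --device GPU \
    --update_method pserver --async_mode --no_split_var
```

Note that `--async_mode` maps to `sync_mode=not args.async_mode` and `--no_split_var` maps to `slice_var_up=not args.no_split_var` in the transpiler call, so omitting both flags preserves the previous synchronous, variable-splitting behaviour.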