diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index eb2ea8c56ac31b..239705b57ed1f5 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -25,10 +25,8 @@ class TestAdamOp1(OpTest): - def setUp(self): - '''Test Adam Op with supplied attributes - ''' + '''Test Adam Op with supplied attributes''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -50,20 +48,19 @@ def setUp(self): 'Moment2': moment2, 'LearningRate': np.array([learning_rate]).astype("float32"), 'Beta1Pow': np.array([beta1_pow]).astype("float32"), - 'Beta2Pow': np.array([beta2_pow]).astype("float32") + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), } self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, self.attrs) + param_out, moment1_out, moment2_out = adam_step(self.inputs, self.attrs) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -71,13 +68,11 @@ def test_check_output(self): class TestAdamOp2(OpTest): - def set_shape(self): self.shape = (102, 105) def setUp(self): - '''Test Adam Op with supplied attributes - ''' + '''Test Adam Op with supplied attributes''' self.op_type = "adam" self.set_shape() param = np.random.uniform(-1, 1, self.shape).astype("float32") @@ -100,20 +95,19 @@ def setUp(self): 'Moment2': moment2, 'LearningRate': np.array([learning_rate]).astype("float32"), 'Beta1Pow': np.array([beta1_pow]).astype("float32"), - 'Beta2Pow': np.array([beta2_pow]).astype("float32") + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), } attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -121,16 +115,13 @@ def test_check_output(self): class TestAdamOnlyTailOp(TestAdamOp2): - def set_shape(self): - self.shape = (3) + self.shape = 3 class TestAdamOpMultipleSteps(OpTest): - def setUp(self): - '''Test Adam Operator with supplied attributes - ''' + '''Test Adam Operator with supplied attributes''' self.op_type = "adam" self.num_steps = 10 @@ -154,19 +145,20 @@ def setUp(self): 'Moment2': moment2, 'LearningRate': np.array([learning_rate]).astype("float32"), 'Beta1Pow': np.array([self.beta1_pow]).astype("float32"), - 'Beta2Pow': np.array([self.beta2_pow]).astype("float32") + 'Beta2Pow': np.array([self.beta2_pow]).astype("float32"), } self.attrs = { 'epsilon': epsilon, 'beta1': self.beta1, - 'beta2': self.beta2 + 'beta2': self.beta2, } def test_check_output(self): for _ in range(self.num_steps): - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, self.attrs) + param_out, moment1_out, moment2_out = adam_step( + self.inputs, self.attrs + ) 
beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1 beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2 @@ -175,7 +167,7 @@ def test_check_output(self): 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': beta1_pow_out, - 'Beta2PowOut': beta2_pow_out + 'Beta2PowOut': beta2_pow_out, } # Verify output for this step @@ -191,8 +183,9 @@ def test_check_output(self): self.inputs['Beta2Pow'] = beta2_pow_out # Randomize gradient for next step - self.inputs['Grad'] = np.random.uniform( - -1, 1, (102, 105)).astype("float32") + self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype( + "float32" + ) def test_api_eager_dygraph(self): with _test_eager_guard(): @@ -272,8 +265,9 @@ def adamw_step(inputs, attributes): return param_out, moment1_out, moment2_out -def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, - lazy_mode): +def adam_step_sparse( + inputs, attributes, height, rows, row_numel, np_grad, lazy_mode +): ''' Simulate one step of the adam optimizer :param inputs: dict of inputs @@ -298,13 +292,16 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, param_out = np.zeros(shape=[height, row_numel]) def update_row(row_id, update_value): - moment1_out[row_id] = beta1 * moment1[row_id] + (1 - - beta1) * update_value - moment2_out[row_id] = beta2 * moment2[row_id] + ( - 1 - beta2) * np.square(update_value) + moment1_out[row_id] = ( + beta1 * moment1[row_id] + (1 - beta1) * update_value + ) + moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square( + update_value + ) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) param_out[row_id] = param[row_id] - lr_t * ( - moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)) + moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon) + ) if lazy_mode: for idx, row_id in enumerate(rows): @@ -320,7 +317,6 @@ def update_row(row_id, update_value): class TestSparseAdamOp(unittest.TestCase): - def setup(self, scope, place, lazy_mode): beta1 = 0.78 beta2 = 0.836 @@ -339,14 +335,14 @@ def setup(self, scope, place, lazy_mode): "Moment2": np.full((height, row_numel), 5.0).astype("float32"), 'Beta1Pow': beta1_pow, 'Beta2Pow': beta2_pow, - "LearningRate": np.full((1), 2.0).astype("float32") + "LearningRate": np.full((1), 2.0).astype("float32"), } self.init_output = np.full((height, row_numel), 0.0).astype("float32") self.attrs = { 'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2, - 'min_row_size_to_use_multithread': 2 + 'min_row_size_to_use_multithread': 2, } grad_selected_rows = scope.var('Grad').get_selected_rows() @@ -361,15 +357,21 @@ def setup(self, scope, place, lazy_mode): self.sparse_inputs = ["Grad"] - param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs, - height, rows, row_numel, - np_array, lazy_mode) + param_out, mom1, mom2 = adam_step_sparse( + self.dense_inputs, + self.attrs, + height, + rows, + row_numel, + np_array, + lazy_mode, + ) self.outputs = { "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2, 'Beta1PowOut': beta1_pow * beta1, - 'Beta2PowOut': beta2_pow * beta2 + 'Beta2PowOut': beta2_pow * beta2, } def check_with_place(self, place, lazy_mode): @@ -414,10 +416,8 @@ def test_sparse_adam(self): class TestAdamOpBetaVariable(OpTest): - def setUp(self): - '''Test Adam Op with beta as Variable - ''' + '''Test Adam Op with beta as Variable''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -446,15 +446,14 @@ def 
setUp(self): attributes = {'epsilon': epsilon} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -462,10 +461,8 @@ def test_check_output(self): class TestAdamOpBetaEpsilonVariable(OpTest): - def setUp(self): - '''Test Adam Op with beta/epsilon as Variable - ''' + '''Test Adam Op with beta/epsilon as Variable''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -495,15 +492,14 @@ def setUp(self): attributes = {'epsilon': epsilon} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -511,10 +507,8 @@ def test_check_output(self): class TestAdamOpWithGlobalBetaPow(OpTest): - def setUp(self): - '''Test Adam Op with global_beta_pow - ''' + '''Test Adam Op with global_beta_pow''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -544,8 +538,7 @@ def setUp(self): attributes = {'epsilon': epsilon} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.attrs = {'use_global_beta_pow': True} @@ -555,7 +548,7 @@ def setUp(self): 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([]), - 'Beta2PowOut': np.array([]) + 'Beta2PowOut': np.array([]), } def test_check_output(self): @@ -563,10 +556,8 @@ def test_check_output(self): class TestAdamOpWithSkipUpdate(OpTest): - def setUp(self): - '''Test Adam Op with global_beta_pow - ''' + '''Test Adam Op with global_beta_pow''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -613,7 +604,6 @@ def test_check_output(self): class TestAdamOpV2(unittest.TestCase): - def test_adam_op(self): place = fluid.CPUPlace() shape = [2, 3, 8, 8] @@ -626,20 +616,20 @@ def test_adam_op(self): conv = fluid.layers.conv2d(data, 8, 3) loss = fluid.layers.reduce_mean(conv) - beta1 = fluid.layers.create_global_var(shape=[1], - value=0.85, - dtype='float32', - persistable=True) - beta2 = fluid.layers.create_global_var(shape=[1], - value=0.95, - dtype='float32', - persistable=True) + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True + ) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True + ) betas = [beta1, beta2] - opt = paddle.optimizer.Adam(learning_rate=1e-5, - beta1=beta1, - beta2=beta2, - weight_decay=0.01, - epsilon=1e-8) + opt = paddle.optimizer.Adam( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + 
epsilon=1e-8, + ) opt.minimize(loss) exe.run(startup) @@ -653,8 +643,9 @@ def test_adam_op_dygraph(self): a = fluid.dygraph.to_variable(value) linear = fluid.Linear(13, 5, dtype="float32") - adam = paddle.optimizer.Adam(learning_rate=0.01, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=linear.parameters() + ) out = linear(a) out.backward() adam.step() @@ -670,26 +661,29 @@ def test_adam_op_with_state_dict(self): state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #learning_rate is LRScheduler + # learning_rate is LRScheduler learning_rate = paddle.optimizer.lr.CosineAnnealingDecay( - learning_rate=0.1, T_max=10) + learning_rate=0.1, T_max=10 + ) adam = paddle.optimizer.Adam( learning_rate=learning_rate, weight_decay=fluid.regularizer.L2Decay(0.001), - parameters=emb.parameters()) + parameters=emb.parameters(), + ) lr = adam.get_lr() state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #leanrning_rate is Tensor + # leanrning_rate is Tensor with self.assertRaises(TypeError): learning_rate = np.array([0.01]).astype("float32") learning_rate = paddle.to_tensor(learning_rate) - adam = paddle.optimizer.Adam(learning_rate=learning_rate, - parameters=emb.parameters()) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters() + ) params = adam.get_opti_var_name_list() - assert (params is not None) + assert params is not None paddle.enable_static() def test_adam_with_grad_clip(self): @@ -698,9 +692,9 @@ def test_adam_with_grad_clip(self): a = fluid.dygraph.to_variable(value) linear = fluid.Linear(13, 5, dtype="float32") clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) - adam = paddle.optimizer.Adam(0.1, - parameters=linear.parameters(), - grad_clip=clip) + adam = paddle.optimizer.Adam( + 0.1, parameters=linear.parameters(), grad_clip=clip + ) out = linear(a) out.backward() adam.step() @@ -715,11 +709,11 @@ def test_adam_op_with_set_lr(self): lr = 0.01 adam.set_lr(lr) cur_lr = adam.get_lr() - assert (lr == cur_lr) + assert lr == cur_lr with self.assertRaises(TypeError): - lr_var = paddle.fluid.layers.create_global_var(shape=[1], - value=lr, - dtype='float32') + lr_var = paddle.fluid.layers.create_global_var( + shape=[1], value=lr, dtype='float32' + ) adam.set_lr(lr_var) paddle.enable_static() @@ -727,17 +721,17 @@ def test_adam_op_invalid_input(self): paddle.disable_static() linear = paddle.nn.Linear(10, 10) with self.assertRaises(ValueError): - adam = paddle.optimizer.Adam(0.1, - beta1=-1, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + 0.1, beta1=-1, parameters=linear.parameters() + ) with self.assertRaises(ValueError): - adam = paddle.optimizer.Adam(0.1, - beta2=-1, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + 0.1, beta2=-1, parameters=linear.parameters() + ) with self.assertRaises(ValueError): - adam = paddle.optimizer.Adam(0.1, - epsilon=-1, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + 0.1, epsilon=-1, parameters=linear.parameters() + ) paddle.enable_static() def test_adam_op_with_sparse_input_and_weight_decay(self): @@ -746,9 +740,9 @@ def test_adam_op_with_sparse_input_and_weight_decay(self): x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64) x = paddle.to_tensor(x_data, stop_gradient=False) emb = paddle.nn.Embedding(10, 10, sparse=True) - adam = paddle.optimizer.Adam(0.001, - parameters=emb.parameters(), - weight_decay=0.01) + adam = paddle.optimizer.Adam( + 0.001, parameters=emb.parameters(), 
weight_decay=0.01 + ) with self.assertRaises(RuntimeError): out = emb(x) @@ -766,13 +760,14 @@ def test_api_eager_dygraph(self): class TestAdamOptimizer(unittest.TestCase): - - def _test(self, - place, - use_tensor=True, - use_fluid_api=True, - use_global_beta_pow=False, - flatten_param_grads=False): + def _test( + self, + place, + use_tensor=True, + use_fluid_api=True, + use_global_beta_pow=False, + flatten_param_grads=False, + ): paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -786,29 +781,30 @@ def _test(self, weight_attr1 = paddle.ParamAttr( name="weight1", initializer=fluid.initializer.Constant(value=1.0), - trainable=True) + trainable=True, + ) weight_attr2 = paddle.ParamAttr( name="weight2", initializer=fluid.initializer.Constant(value=2.0), - trainable=True) + trainable=True, + ) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) with paddle.static.program_guard(main_prog, startup_prog): with paddle.utils.unique_name.guard(): a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') b = paddle.static.data(name="b", shape=[2, 2], dtype='float32') - label = paddle.static.data(name="label", - shape=[2, 1], - dtype='int64') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64' + ) sum = paddle.add(a, b) z = paddle.pow(sum, 2.0) fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1) - prediction = fluid.layers.fc(input=fc_1, - size=2, - param_attr=weight_attr2, - act='softmax') + prediction = fluid.layers.fc( + input=fc_1, size=2, param_attr=weight_attr2, act='softmax' + ) cost = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.reduce_mean(cost) @@ -821,19 +817,22 @@ def _test(self, value=float(beta1_init), dtype='float32', persistable=True, - name="beta1") + name="beta1", + ) beta2 = fluid.layers.create_global_var( shape=[1], value=float(beta2_init), dtype='float32', persistable=True, - name="beta2") + name="beta2", + ) epsilon = fluid.layers.create_global_var( shape=[1], value=float(epsilon_init), dtype='float32', persistable=True, - name="epsilon") + name="epsilon", + ) if use_fluid_api: adam = fluid.optimizer.Adam( learning_rate=0.01, @@ -843,13 +842,16 @@ def _test(self, use_global_beta_pow=use_global_beta_pow, flatten_param_grads=flatten_param_grads, align_size=256, - grad_clip=clip) + grad_clip=clip, + ) else: - adam = paddle.optimizer.Adam(learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=clip) + adam = paddle.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=clip, + ) else: if use_fluid_api: adam = fluid.optimizer.Adam( @@ -860,13 +862,16 @@ def _test(self, use_global_beta_pow=use_global_beta_pow, flatten_param_grads=flatten_param_grads, align_size=256, - grad_clip=clip) + grad_clip=clip, + ) else: - adam = fluid.optimizer.Adam(learning_rate=0.01, - beta1=beta1_init, - beta2=beta2_init, - epsilon=epsilon_init, - grad_clip=clip) + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1_init, + beta2=beta2_init, + epsilon=epsilon_init, + grad_clip=clip, + ) adam.minimize(loss) @@ -877,15 +882,16 @@ def _test(self, print("Start run on {}".format(place)) for epoch in range(10): - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, "b": b_np, 
"label": label_np}, + fetch_list=[prediction, loss], + ) + print( + "Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res + ) + ) paddle.disable_static() return pred_res, loss_res @@ -897,10 +903,13 @@ def _test_with_place(self, place): for use_fluid_api in [True, False]: for use_global_beta_pow in [True, False]: for flatten_param_grads in [True, False]: - pred, loss = self._test(place, use_tensor, - use_fluid_api, - use_global_beta_pow, - flatten_param_grads) + pred, loss = self._test( + place, + use_tensor, + use_fluid_api, + use_global_beta_pow, + flatten_param_grads, + ) preds.append(pred) losses.append(loss) for pred in preds: @@ -922,21 +931,22 @@ def test_adam_flatten_param_grads_with_regularizer(self): name="weight1", initializer=fluid.initializer.Constant(value=1.0), regularizer=fluid.regularizer.L1DecayRegularizer( - regularization_coeff=0.1), - trainable=True) + regularization_coeff=0.1 + ), + trainable=True, + ) with fluid.program_guard(main): x = fluid.data(name='x', shape=[None, 13], dtype='float32') y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, - size=1, - act=None, - param_attr=weight_attr) + y_predict = fluid.layers.fc( + input=x, size=1, act=None, param_attr=weight_attr + ) cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - adam = fluid.optimizer.AdamOptimizer(0.01, - flatten_param_grads=True, - align_size=256) + adam = fluid.optimizer.AdamOptimizer( + 0.01, flatten_param_grads=True, align_size=256 + ) adam.minimize(avg_cost) paddle.disable_static() @@ -959,13 +969,16 @@ def test_adam_exception(self): adam = fluid.optimizer.Adam(use_global_beta_pow=True) adam.minimize(loss) self.assertRaises(Exception, adam._get_global_accumulator, 'tmp') - adam._add_global_accumulator('tmp', - type=core.VarDesc.VarType.LOD_TENSOR) + adam._add_global_accumulator( + 'tmp', type=core.VarDesc.VarType.LOD_TENSOR + ) adam._get_global_accumulator('tmp') - self.assertRaises(Exception, - adam._add_global_accumulator, - adam._beta1_pow_acc_str, - type=core.VarDesc.VarType.LOD_TENSOR) + self.assertRaises( + Exception, + adam._add_global_accumulator, + adam._beta1_pow_acc_str, + type=core.VarDesc.VarType.LOD_TENSOR, + ) paddle.disable_static() def test_adam_save_load(self): @@ -976,12 +989,14 @@ def test_adam_save_load(self): state_dict = linear.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, - warmup_steps=100, - verbose=True) - adam = paddle.fluid.optimizer.Adam(learning_rate=scheduler, - parameter_list=linear.parameters(), - use_global_beta_pow=True) + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True + ) + adam = paddle.fluid.optimizer.Adam( + learning_rate=scheduler, + parameter_list=linear.parameters(), + use_global_beta_pow=True, + ) adam.minimize(b) state_dict = adam.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") @@ -1002,13 +1017,14 @@ def get_opt(dtype, shape): state_dict = linear.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, - warmup_steps=100, - verbose=True) + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True + ) adam = paddle.fluid.optimizer.Adam( learning_rate=scheduler, parameter_list=linear.parameters(), - use_global_beta_pow=True) + use_global_beta_pow=True, + ) adam.minimize(b) return adam @@ -1023,14 +1039,14 @@ def get_opt(dtype, shape): 
self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict) adam3 = get_opt('float32', [10, 10]) # shape not match - opt_state_dict['beta1_pow_acc_0'] = np.array([0.9, 0.9], - dtype='float32') + opt_state_dict['beta1_pow_acc_0'] = np.array( + [0.9, 0.9], dtype='float32' + ) self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict) paddle.enable_static() class TestAdamOpV2Group(TestAdamOpV2): - def test_adam_op(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -1038,16 +1054,19 @@ def test_adam_op(self): linear_1 = paddle.nn.Linear(13, 5) linear_2 = paddle.nn.Linear(5, 3) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate=0.01, - parameters=[{ - 'params': linear_1.parameters() - }, { - 'params': linear_2.parameters(), - 'weight_decay': 0.001, - 'beta1': 0.1, - 'beta2': 0.99 - }], - weight_decay=0.1) + adam = paddle.optimizer.Adam( + learning_rate=0.01, + parameters=[ + {'params': linear_1.parameters()}, + { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + }, + ], + weight_decay=0.1, + ) out = linear_1(a) out = linear_2(out) out.backward() @@ -1056,13 +1075,14 @@ def test_adam_op(self): class TestMultiTensorAdam(unittest.TestCase): - - def _adam_optimize_dygraph(self, - place, - use_param_attr=False, - use_param_group=False, - use_amp=False, - use_multi_tensor=False): + def _adam_optimize_dygraph( + self, + place, + use_param_attr=False, + use_param_group=False, + use_amp=False, + use_multi_tensor=False, + ): paddle.disable_static() paddle.seed(10) paddle.set_device(place) @@ -1072,29 +1092,40 @@ def _adam_optimize_dygraph(self, weight_attr = paddle.ParamAttr( learning_rate=0.5, regularizer=paddle.regularizer.L2Decay(1.0), - trainable=True) + trainable=True, + ) if use_param_attr: model = paddle.nn.Linear(5, 5, weight_attr) else: model = paddle.nn.Linear(5, 5) if not use_param_group: - optimizer = paddle.optimizer.Adam(parameters=model.parameters(), - use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp, + ) else: - optimizer = paddle.optimizer.Adam(parameters=[{ - 'params': - model.parameters(), - 'weight_decay': - 0.001, - 'beta1': - 0.1, - 'beta2': - 0.99 - }], - use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + parameters = list(model.parameters()) + param_num = len(parameters) + optimizer = paddle.optimizer.Adam( + parameters=[ + { + 'params': parameters[: int(param_num / 2)], + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + }, + { + 'params': parameters[int(param_num / 2) :], + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + }, + ], + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp, + ) for idx in range(2): if place == 'gpu' and use_amp == True: @@ -1118,10 +1149,9 @@ def _adam_optimize_dygraph(self, return output, model.parameters() - def _adam_optimize_static(self, - place, - use_amp=False, - use_multi_tensor=False): + def _adam_optimize_static( + self, place, use_amp=False, use_multi_tensor=False + ): paddle.enable_static() paddle.seed(10) np.random.seed(10) @@ -1130,24 +1160,26 @@ def _adam_optimize_static(self, exe = paddle.static.Executor(place=place) train_program = paddle.static.Program() startup_program = paddle.static.Program() - optimizer = paddle.optimizer.Adam(multi_precision=use_amp, - use_multi_tensor=use_multi_tensor) + optimizer = 
paddle.optimizer.Adam( + multi_precision=use_amp, use_multi_tensor=use_multi_tensor + ) if use_amp: optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=128.0, use_dynamic_loss_scaling=True, use_pure_fp16=True, - use_fp16_guard=False) + use_fp16_guard=False, + ) with paddle.static.program_guard(train_program, startup_program): if use_amp: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float16') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) else: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float32') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) optimizer.minimize(loss) @@ -1159,9 +1191,9 @@ def _adam_optimize_static(self, x = np.random.random(size=(2, 2)).astype('float32') out = [] for idx in range(5): - loss_data, = exe.run(train_program, - feed={"X": x}, - fetch_list=[loss.name]) + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) out.append(loss_data) return out @@ -1174,49 +1206,59 @@ def _get_places(self): def _check_with_place_amp(self, place, use_amp): # test dygraph mode output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=True) + place=place, use_amp=use_amp, use_multi_tensor=True + ) output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=False) + place=place, use_amp=use_amp, use_multi_tensor=False + ) np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05) for idx in range(len(params_dygraph1)): - np.testing.assert_allclose(params_dygraph1[idx], - params_dygraph2[idx], - rtol=1e-05) + np.testing.assert_allclose( + params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05 + ) # test static mode - output_static1 = self._adam_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=True) - output_static2 = self._adam_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=False) + output_static1 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=True + ) + output_static2 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=False + ) for idx in range(len(output_static1)): - np.testing.assert_allclose(output_static1[idx], - output_static2[idx], - rtol=1e-05) + np.testing.assert_allclose( + output_static1[idx], output_static2[idx], rtol=1e-05 + ) def _check_with_param_arrt(self, place, use_amp): - output1, params1 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_attr=True, - use_multi_tensor=True) - output2, params2 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_attr=True, - use_multi_tensor=False) + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=True, + ) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) def _check_with_param_group(self, place, use_amp): - output1, params1 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_group=True, - use_multi_tensor=True) - output2, params2 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_group=True, - 
use_multi_tensor=False) + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=True, + ) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 7d037798588530..614d53de5ed040 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -25,14 +25,16 @@ from paddle.fluid.framework import _test_eager_guard -def calculate_momentum_by_numpy(param, - grad, - mu, - velocity, - use_nesterov, - learning_rate, - regularization_method=None, - regularization_coeff=1.0): +def calculate_momentum_by_numpy( + param, + grad, + mu, + velocity, + use_nesterov, + learning_rate, + regularization_method=None, + regularization_coeff=1.0, +): if regularization_method == "l2_decay": grad = grad + regularization_coeff * param @@ -44,8 +46,9 @@ def calculate_momentum_by_numpy(param, else: velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate - \ - velocity_out * mu * learning_rate + param_out = ( + param - grad * learning_rate - velocity_out * mu * learning_rate + ) else: param_out = param - learning_rate * velocity_out @@ -53,7 +56,6 @@ def calculate_momentum_by_numpy(param, class TestMomentumOp1(OpTest): - def setUp(self): self.op_type = "momentum" self.dtype = np.float32 @@ -70,7 +72,7 @@ def setUp(self): 'Param': param, 'Grad': grad, 'Velocity': velocity, - 'LearningRate': learning_rate + 'LearningRate': learning_rate, } self.attrs = {'mu': mu} @@ -81,7 +83,8 @@ def setUp(self): mu=mu, velocity=velocity, use_nesterov=use_nesterov, - learning_rate=learning_rate) + learning_rate=learning_rate, + ) self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} @@ -93,7 +96,6 @@ def test_check_output(self): class TestMomentumOpFp16(TestMomentumOp1): - def init_dtype(self): self.dtype = np.float16 @@ -102,8 +104,7 @@ def test_check_output(self): class TestMomentumOp2(OpTest): - '''Test Momentum with default values for attributes - ''' + '''Test Momentum with default values for attributes''' def setUp(self): self.op_type = "momentum" @@ -119,7 +120,7 @@ def setUp(self): 'Param': param, 'Grad': grad, 'Velocity': velocity, - 'LearningRate': learning_rate + 'LearningRate': learning_rate, } self.attrs = {'mu': mu, 'use_nesterov': use_nesterov} @@ -130,7 +131,8 @@ def setUp(self): mu=mu, velocity=velocity, use_nesterov=use_nesterov, - learning_rate=learning_rate) + learning_rate=learning_rate, + ) self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} @@ -138,10 +140,10 @@ def test_check_output(self): self.check_output() -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) class TestLarsMomentumOpWithMP(OpTest): - def setUp(self): self.config() self.op_type = "lars_momentum" @@ -168,11 +170,16 @@ def setUp(self): fp32_grad = grad.astype("float32") pnorm = np.sqrt(np.square(master_param).sum()) gnorm = np.sqrt(np.square(fp32_grad).sum()) - local_lr = learning_rate * lars_coeff * pnorm / ( - gnorm + lars_weight_decay * pnorm) + local_lr = ( + learning_rate + * lars_coeff + * pnorm + / (gnorm + 
lars_weight_decay * pnorm) + ) fp32_grad = fp32_grad * rescale_grad velocity_out = mu * velocity + local_lr * ( - fp32_grad + lars_weight_decay * master_param) + fp32_grad + lars_weight_decay * master_param + ) p_new = master_param - velocity_out param_out = p_new.astype("float16") master_param_out = p_new @@ -185,7 +192,8 @@ def setUp(self): param_outs.append(("SubParam_out_" + str(i), param_out)) master_params.append(("SubMasterParam_" + str(i), master_param)) master_param_outs.append( - ("SubMasterParamOut_" + str(i), master_param_out)) + ("SubMasterParamOut_" + str(i), master_param_out) + ) self.inputs = { 'Param': params, @@ -200,13 +208,13 @@ def setUp(self): 'lars_coeff': lars_coeff, 'lars_weight_decay': [lars_weight_decay], 'multi_precision': True, - 'rescale_grad': rescale_grad + 'rescale_grad': rescale_grad, } self.outputs = { 'ParamOut': param_outs, 'VelocityOut': velocity_outs, - 'MasterParamOut': master_param_outs + 'MasterParamOut': master_param_outs, } def test_check_output(self): @@ -221,7 +229,6 @@ def config(self): class TestLarsMomentumOp(OpTest): - def setUp(self): self.config() self.op_type = "lars_momentum" @@ -242,10 +249,15 @@ def setUp(self): learning_rate = np.array([0.001]).astype("float32") pnorm = np.sqrt(np.square(param).sum()) gnorm = np.sqrt(np.square(grad).sum()) - local_lr = learning_rate * lars_coeff * pnorm / ( - gnorm + lars_weight_decay * param) + local_lr = ( + learning_rate + * lars_coeff + * pnorm + / (gnorm + lars_weight_decay * param) + ) velocity_out = mu * velocity + local_lr * ( - grad + lars_weight_decay * param) + grad + lars_weight_decay * param + ) param_out = param - velocity_out params.append(("SubParam_" + str(i), param)) @@ -259,13 +271,13 @@ def setUp(self): 'Param': params, 'Grad': grads, 'Velocity': velocitys, - 'LearningRate': learning_rates + 'LearningRate': learning_rates, } self.attrs = { 'mu': mu, 'lars_coeff': lars_coeff, - 'lars_weight_decay': [lars_weight_decay] + 'lars_weight_decay': [lars_weight_decay], } self.outputs = {'ParamOut': param_outs, 'VelocityOut': velocity_outs} @@ -278,7 +290,6 @@ def config(self): class TestSparseMomentumOp(unittest.TestCase): - def setUp(self): self.use_nesterov = False self.regularization_method = "" @@ -317,8 +328,9 @@ def check_with_place(self, place): velocity_np_array = np.ones((height, row_numel)).astype("float32") velocity.set(velocity_np_array, place) velocity_out = scope.var('VelocityOut').get_tensor() - velocity_out_np_array = np.full((height, row_numel), - 0.0).astype("float32") + velocity_out_np_array = np.full((height, row_numel), 0.0).astype( + "float32" + ) velocity_out.set(velocity_out_np_array, place) # create and initialize LearningRate Variable @@ -327,17 +339,19 @@ def check_with_place(self, place): lr.set(lr_array, place) # create and run operator - op = Operator("momentum", - Param='Param', - Grad='Grad', - Velocity='Velocity', - ParamOut='ParamOut', - VelocityOut='VelocityOut', - LearningRate='LearningRate', - mu=mu, - use_nesterov=use_nesterov, - regularization_method=regularization_method, - regularization_coeff=regularization_coeff) + op = Operator( + "momentum", + Param='Param', + Grad='Grad', + Velocity='Velocity', + ParamOut='ParamOut', + VelocityOut='VelocityOut', + LearningRate='LearningRate', + mu=mu, + use_nesterov=use_nesterov, + regularization_method=regularization_method, + regularization_coeff=regularization_coeff, + ) op.run(scope, place) # get and compare result @@ -360,7 +374,8 @@ def check_with_place(self, place): use_nesterov=use_nesterov, 
learning_rate=lr_array, regularization_method=regularization_method, - regularization_coeff=regularization_coeff) + regularization_coeff=regularization_coeff, + ) self.assertTrue((_velocity_out == velocity_out_np_array).all()) self.assertTrue((_param_out == param_out_np_array).all()) @@ -377,13 +392,11 @@ def test_sparse_momentum(self): class TestSparseMomentumOp2(TestSparseMomentumOp): - def init_kernel(self): self.use_nesterov = True class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): - def setUp(self): self.init_args() self.regularization_method = "" @@ -427,8 +440,9 @@ def check_with_place(self, place): velocity_np_array = np.ones((height, row_numel)).astype("float32") velocity.set(velocity_np_array, place) velocity_out = scope.var('VelocityOut').get_tensor() - velocity_out_np_array = np.full((height, row_numel), - 0.0).astype("float32") + velocity_out_np_array = np.full((height, row_numel), 0.0).astype( + "float32" + ) velocity_out.set(velocity_out_np_array, place) # create and initialize LearningRate Variable @@ -437,21 +451,23 @@ def check_with_place(self, place): lr.set(lr_array, place) # create and run operator - op = Operator("momentum", - Param='Param', - Grad='Grad', - Velocity='Velocity', - MasterParam='MasterParam', - ParamOut='ParamOut', - VelocityOut='VelocityOut', - MasterParamOut='MasterParamOut', - LearningRate='LearningRate', - mu=mu, - use_nesterov=use_nesterov, - regularization_method=regularization_method, - regularization_coeff=regularization_coeff, - multi_precision=True, - rescale_grad=1.0) + op = Operator( + "momentum", + Param='Param', + Grad='Grad', + Velocity='Velocity', + MasterParam='MasterParam', + ParamOut='ParamOut', + VelocityOut='VelocityOut', + MasterParamOut='MasterParamOut', + LearningRate='LearningRate', + mu=mu, + use_nesterov=use_nesterov, + regularization_method=regularization_method, + regularization_coeff=regularization_coeff, + multi_precision=True, + rescale_grad=1.0, + ) op.run(scope, place) # get and compare result @@ -472,7 +488,8 @@ def check_with_place(self, place): use_nesterov=use_nesterov, learning_rate=lr_array, regularization_method=regularization_method, - regularization_coeff=regularization_coeff) + regularization_coeff=regularization_coeff, + ) self.assertTrue((_velocity_out == velocity_out_np_array).all()) self.assertTrue((_param_out == param_out_np_array).all()) @@ -486,23 +503,22 @@ def test_sparse_momentum(self): class TestSparseMomentumOpWithMultiPrecision2( - TestSparseMomentumOpWithMultiPrecision): - + TestSparseMomentumOpWithMultiPrecision +): def init_args(self): self.use_nesterov = True class TestMomentumV2(unittest.TestCase): - def test_momentum_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. 
- adam = paddle.optimizer.Momentum(learning_rate=0.01, - momentum=0.9, - parameters=linear.parameters()) + adam = paddle.optimizer.Momentum( + learning_rate=0.01, momentum=0.9, parameters=linear.parameters() + ) out = linear(a) out.backward() adam.step() @@ -519,13 +535,15 @@ def test_momentum(self): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, - momentum=0.9) + rms_optimizer = paddle.optimizer.Momentum( + learning_rate=0.1, momentum=0.9 + ) rms_optimizer.minimize(avg_cost) fetch_list = [avg_cost] - train_reader = paddle.batch(paddle.dataset.uci_housing.train(), - batch_size=1) + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1 + ) feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -533,9 +551,9 @@ def test_momentum(self): exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) def test_raise_error(self): - self.assertRaises(ValueError, - paddle.optimizer.Momentum, - learning_rate=None) + self.assertRaises( + ValueError, paddle.optimizer.Momentum, learning_rate=None + ) self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None) def test_api_eager_dygraph(self): @@ -545,7 +563,6 @@ def test_api_eager_dygraph(self): class TestMomentumOpWithDecay(OpTest): - def setUp(self): self.op_type = "momentum" self.dtype = np.float32 @@ -567,14 +584,14 @@ def setUp(self): 'Param': param, 'Grad': grad, 'Velocity': velocity, - 'LearningRate': learning_rate + 'LearningRate': learning_rate, } self.attrs = { 'mu': mu, 'use_nesterov': use_nesterov, 'regularization_method': regularization_method, - 'regularization_coeff': regularization_coeff + 'regularization_coeff': regularization_coeff, } grad = grad + regularization_coeff * param @@ -585,7 +602,8 @@ def setUp(self): mu=mu, velocity=velocity, use_nesterov=use_nesterov, - learning_rate=learning_rate) + learning_rate=learning_rate, + ) self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} @@ -598,7 +616,6 @@ def test_check_output(self): class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay): - def init_config(self): self.dtype = np.float16 @@ -608,13 +625,11 @@ def test_check_output(self): class TestMomentumOpWithDecay2(TestMomentumOpWithDecay): - def init_config(self): self.use_nesterov = False class TestSparseMomentumOpWithDecay(TestSparseMomentumOp): - def setUp(self): self.use_nesterov = False self.regularization_method = 'l2_decay' @@ -622,13 +637,11 @@ def setUp(self): class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay): - def init_kernel(self): self.use_nesterov = True class TestMomentumOpWithDecayAPI(unittest.TestCase): - def _test_momentum_dygraph_common(self, regularization): paddle.disable_static() inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") @@ -641,13 +654,16 @@ def _test_momentum_dygraph_common(self, regularization): learning_rate=0.01, momentum=0.9, parameter_list=linear.parameters(), - regularization=regularization) + regularization=regularization, + ) momentum.minimize(loss) def test_momentum_dygraph_1(self): self._test_momentum_dygraph_common( regularization=paddle.fluid.regularizer.L2Decay( - regularization_coeff=0.1)) + regularization_coeff=0.1 + ) + ) def test_momentum_static(self): paddle.enable_static() @@ -661,12 +677,14 @@ def test_momentum_static(self): avg_cost = paddle.mean(cost) momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( - 
learning_rate=0.1, momentum=0.9) + learning_rate=0.1, momentum=0.9 + ) momentum_optimizer.minimize(avg_cost) fetch_list = [avg_cost] - train_reader = paddle.batch(paddle.dataset.uci_housing.train(), - batch_size=1) + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1 + ) feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -675,23 +693,23 @@ def test_momentum_static(self): class TestFusedMomentumWithDecayAPI(unittest.TestCase): - def get_program(self, weight_attr, bias_attr=False): main_program = paddle.static.Program() startup_program = paddle.static.Program() - with paddle.static.program_guard(main_program=main_program, - startup_program=startup_program): + with paddle.static.program_guard( + main_program=main_program, startup_program=startup_program + ): x = paddle.static.data(name='x', shape=[10, 10]) - linear = paddle.nn.Linear(10, - 10, - weight_attr=weight_attr, - bias_attr=bias_attr) + linear = paddle.nn.Linear( + 10, 10, weight_attr=weight_attr, bias_attr=bias_attr + ) out = linear(x) loss = paddle.mean(out) optimizer = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9, - weight_decay=paddle.regularizer.L2Decay(0.5)) + weight_decay=paddle.regularizer.L2Decay(0.5), + ) optimizer.minimize(loss) return main_program @@ -700,7 +718,8 @@ def test_param_has_l2decay(self): weight_attr = paddle.ParamAttr( name="weight", initializer=paddle.nn.initializer.Constant(value=0.5), - regularizer=paddle.regularizer.L2Decay(0.1)) + regularizer=paddle.regularizer.L2Decay(0.1), + ) program = self.get_program(weight_attr, bias_attr=False) ops = program.global_block().ops @@ -715,11 +734,13 @@ def test_param_has_l1decay(self): weight_attr = paddle.ParamAttr( name="weight", initializer=paddle.nn.initializer.Constant(value=0.5), - regularizer=paddle.regularizer.L1Decay(0.1)) + regularizer=paddle.regularizer.L1Decay(0.1), + ) bias_attr = paddle.ParamAttr( name="bias", - initializer=paddle.nn.initializer.Constant(value=0.), - regularizer=None) + initializer=paddle.nn.initializer.Constant(value=0.0), + regularizer=None, + ) program = self.get_program(weight_attr, bias_attr) ops = program.global_block().ops @@ -734,8 +755,9 @@ def test_param_has_l1decay(self): self.assertEqual(ops[-1].attr('regularization_coeff'), 0) if 'bias' in ops[-2].input('Param'): self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay') - self.assertEqual(ops[-2].attr('regularization_coeff'), - np.float32(0.5)) + self.assertEqual( + ops[-2].attr('regularization_coeff'), np.float32(0.5) + ) def test_param_has_no_regularizer(self): paddle.enable_static() @@ -749,11 +771,11 @@ def test_param_has_no_regularizer(self): class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): - def __update_params(self, momentum, linear): for i in range(10): - inp = paddle.full(shape=[2, 2], fill_value=i, - dtype='float32').astype("float32") + inp = paddle.full( + shape=[2, 2], fill_value=i, dtype='float32' + ).astype("float32") inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) @@ -768,32 +790,39 @@ def __test_vs(self, place=fluid.CPUPlace()): 2, 2, weight_attr=paddle.nn.initializer.Constant(value=2.0), - bias_attr=paddle.nn.initializer.Constant(value=2.0)) + bias_attr=paddle.nn.initializer.Constant(value=2.0), + ) momentum_old = paddle.fluid.optimizer.Momentum( learning_rate=0.01, momentum=0.9, parameter_list=linear_old.parameters(), regularization=paddle.fluid.regularizer.L2Decay( - 
regularization_coeff=0.1)) + regularization_coeff=0.1 + ), + ) self.__update_params(momentum=momentum_old, linear=linear_old) linear_new = paddle.nn.Linear( 2, 2, weight_attr=paddle.nn.initializer.Constant(value=2.0), - bias_attr=paddle.nn.initializer.Constant(value=2.0)) + bias_attr=paddle.nn.initializer.Constant(value=2.0), + ) momentum_new = paddle.fluid.contrib.optimizer.Momentum( learning_rate=0.01, momentum=0.9, parameter_list=linear_new.parameters(), regularization=paddle.fluid.regularizer.L2Decay( - regularization_coeff=0.1)) + regularization_coeff=0.1 + ), + ) self.__update_params(momentum=momentum_new, linear=linear_new) self.assertEqual( (linear_old.weight.numpy() == linear_new.weight.numpy()).all(), True, - 'the param weight updated by two Momentum optimizers should equal') + 'the param weight updated by two Momentum optimizers should equal', + ) def test_vs(self, place=fluid.CPUPlace()): places = [fluid.CPUPlace()] @@ -805,7 +834,6 @@ def test_vs(self, place=fluid.CPUPlace()): class TestMomentumV2Group(TestMomentumV2): - def test_momentum_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -813,22 +841,20 @@ def test_momentum_dygraph(self): linear_1 = paddle.nn.Linear(13, 5) linear_2 = paddle.nn.Linear(5, 3) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Momentum(learning_rate=0.01, - parameters=[{ - 'params': - linear_1.parameters() - }, { - 'params': - linear_2.parameters(), - 'weight_decay': - 0.001, - 'learning_rate': - 0.1, - 'momentum': - 0.99 - }], - weight_decay=0.1, - momentum=0.9) + adam = paddle.optimizer.Momentum( + learning_rate=0.01, + parameters=[ + {'params': linear_1.parameters()}, + { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99, + }, + ], + weight_decay=0.1, + momentum=0.9, + ) out = linear_1(a) out = linear_2(out) out.backward() @@ -837,13 +863,14 @@ def test_momentum_dygraph(self): class TestMultiTensorMomentumDygraph(unittest.TestCase): - - def _momentum_optimize_dygraph(self, - place, - use_param_attr=False, - use_param_group=False, - use_amp=False, - use_multi_tensor=False): + def _momentum_optimize_dygraph( + self, + place, + use_param_attr=False, + use_param_group=False, + use_amp=False, + use_multi_tensor=False, + ): paddle.disable_static() paddle.seed(10) paddle.set_device(place) @@ -851,7 +878,8 @@ def _momentum_optimize_dygraph(self, weight_attr = paddle.ParamAttr( learning_rate=0.5, regularizer=paddle.regularizer.L2Decay(1.0), - trainable=True) + trainable=True, + ) if use_param_attr: model = paddle.nn.Linear(5, 5, weight_attr) else: @@ -860,17 +888,29 @@ def _momentum_optimize_dygraph(self, optimizer = paddle.optimizer.Momentum( parameters=model.parameters(), use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + multi_precision=use_amp, + ) else: + parameters = list(model.parameters()) + n = len(parameters) optimizer = paddle.optimizer.Momentum( - parameters=[{ - 'params': model.parameters(), - 'weight_decay': 0.001, - 'learning_rate': 0.1, - 'momentum': 0.99 - }], + parameters=[ + { + 'params': parameters[: int(n / 2)], + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99, + }, + { + 'params': parameters[int(n / 2) :], + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99, + }, + ], use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + multi_precision=use_amp, + ) for idx in range(5): if place == 'gpu' and use_amp == True: model = paddle.amp.decorate(models=model, 
level='O2') @@ -900,9 +940,11 @@ def _get_places(self): def _check_with_place_amp(self, place, use_amp): output1, params1 = self._momentum_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=True) + place=place, use_amp=use_amp, use_multi_tensor=True + ) output2, params2 = self._momentum_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=False) + place=place, use_amp=use_amp, use_multi_tensor=False + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): @@ -913,12 +955,14 @@ def _check_with_param_arrt(self, place, use_amp): place=place, use_amp=use_amp, use_param_attr=True, - use_multi_tensor=True) + use_multi_tensor=True, + ) output2, params2 = self._momentum_optimize_dygraph( place=place, use_amp=use_amp, use_param_attr=True, - use_multi_tensor=False) + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) @@ -928,12 +972,14 @@ def _check_with_param_group(self, place, use_amp): place=place, use_amp=use_amp, use_param_group=True, - use_multi_tensor=True) + use_multi_tensor=True, + ) output2, params2 = self._momentum_optimize_dygraph( place=place, use_amp=use_amp, use_param_group=True, - use_multi_tensor=False) + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) @@ -952,11 +998,9 @@ def test_api_eager_dygraph(self): class TestMultiTensorMomentumStatic(unittest.TestCase): - - def _momentum_optimize_static(self, - place, - use_amp=False, - use_multi_tensor=False): + def _momentum_optimize_static( + self, place, use_amp=False, use_multi_tensor=False + ): paddle.enable_static() paddle.seed(10) np.random.seed(10) @@ -965,24 +1009,26 @@ def _momentum_optimize_static(self, exe = paddle.static.Executor(place=place) train_program = paddle.static.Program() startup_program = paddle.static.Program() - optimizer = paddle.optimizer.Momentum(multi_precision=use_amp, - use_multi_tensor=use_multi_tensor) + optimizer = paddle.optimizer.Momentum( + multi_precision=use_amp, use_multi_tensor=use_multi_tensor + ) if use_amp: optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=128.0, use_dynamic_loss_scaling=True, use_pure_fp16=True, - use_fp16_guard=False) + use_fp16_guard=False, + ) with paddle.static.program_guard(train_program, startup_program): if use_amp: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float16') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) else: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float32') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) optimizer.minimize(loss) @@ -994,9 +1040,9 @@ def _momentum_optimize_static(self, x = numpy.random.random(size=(2, 2)).astype('float32') out = [] for idx in range(5): - loss_data, = exe.run(train_program, - feed={"X": x}, - fetch_list=[loss.name]) + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) out.append(loss_data) return out @@ -1007,12 +1053,12 @@ def _get_places(self): return places def _check_with_place_amp(self, place, use_amp): - output1 = self._momentum_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=True) - output2 = self._momentum_optimize_static(place=place, - use_amp=use_amp, - 
use_multi_tensor=False) + output1 = self._momentum_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=True + ) + output2 = self._momentum_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=False + ) for idx in range(len(output1)): np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 8ba040c1116e47..ba3bd964bf14cb 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -163,18 +163,20 @@ class Adam(Optimizer): _beta1_pow_acc_str = "beta1_pow_acc" _beta2_pow_acc_str = "beta2_pow_acc" - def __init__(self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameters=None, - weight_decay=None, - grad_clip=None, - lazy_mode=False, - multi_precision=False, - use_multi_tensor=False, - name=None): + def __init__( + self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + use_multi_tensor=False, + name=None, + ): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -182,20 +184,25 @@ def __init__(self, if not isinstance(beta1, Variable): if not 0 <= beta1 < 1: raise ValueError( - "Invaild value of beta1, expect beta1 in [0,1).") + "Invaild value of beta1, expect beta1 in [0,1)." + ) if not isinstance(beta2, Variable): if not 0 <= beta2 < 1: raise ValueError( - "Invaild value of beta2, expect beta2 in [0,1).") + "Invaild value of beta2, expect beta2 in [0,1)." + ) if not isinstance(epsilon, Variable): if not 0 <= epsilon: raise ValueError( - "Invaild value of epsilon, expect epsilon >= 0.") - super(Adam, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) + "Invaild value of epsilon, expect epsilon >= 0." 
+ ) + super(Adam, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) self.type = "adam" self._beta1 = beta1 self._beta2 = beta2 @@ -212,21 +219,13 @@ def __init__(self, self._use_multi_tensor = use_multi_tensor if self._use_multi_tensor: - self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._beta1_pow_acc_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } - self._beta2_pow_acc_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } - self._master_weight_dict = { - 'FP32_LODTensor': None, - 'FP16_LODTensor': [] - } + self._param_dict = self._create_multi_tensor_dict() + self._moment1_dict = self._create_multi_tensor_dict() + self._moment2_dict = self._create_multi_tensor_dict() + self._beta1_pow_acc_dict = self._create_multi_tensor_dict() + self._beta2_pow_acc_dict = self._create_multi_tensor_dict() + self._master_weight_dict = self._create_multi_tensor_dict() + self._master_weight_dict['FP32_LODTensor'] = None def _create_master_weight(self, param): if param.name in self._master_weights: @@ -236,19 +235,23 @@ def _create_master_weight(self, param): var_name = param.name + "_fp32_master" var_name = unique_name.generate(var_name) - var = layers.create_global_var(name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) block = self.helper.startup_program.global_block() - block.append_op(type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) self._master_weights[param.name] = var return var @@ -262,20 +265,30 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 - target_param = self._master_weights[ - param.name] if find_master else param + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) target_name = target_param.name - if (name not in self._accumulators - or target_name not in self._accumulators[name]): + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): raise Exception( "Accumulator {} does not exist for parameter {}".format( - name, target_name)) + name, target_name + ) + ) return self._accumulators[name][target_name] def _add_moments_pows(self, p): acc_dtype = p.dtype - if acc_dtype == core.VarDesc.VarType.FP16 or acc_dtype == core.VarDesc.VarType.BF16: + if ( + acc_dtype == core.VarDesc.VarType.FP16 + or acc_dtype == core.VarDesc.VarType.BF16 + ): acc_dtype = core.VarDesc.VarType.FP32 self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) @@ -283,18 +296,24 @@ def _add_moments_pows(self, p): name=self._beta1_pow_acc_str, param=p, dtype=acc_dtype, - fill_value=0.9 if isinstance(self._beta1, Variable) \ - else self._beta1, + fill_value=0.9 + if 
isinstance(self._beta1, Variable) + else self._beta1, shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + type=core.VarDesc.VarType.LOD_TENSOR, + device='cpu', + ) self._add_accumulator( name=self._beta2_pow_acc_str, param=p, dtype=acc_dtype, - fill_value=0.999 if isinstance(self._beta2, Variable) \ - else self._beta2, + fill_value=0.999 + if isinstance(self._beta2, Variable) + else self._beta2, shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + type=core.VarDesc.VarType.LOD_TENSOR, + device='cpu', + ) def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -307,7 +326,10 @@ def _create_accumulators(self, block, parameters): master_p = self._create_master_weight(p) self._add_moments_pows(master_p) continue - if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): warnings.warn( "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." "Consider using multi_precision=True option of the Adam optimizer." @@ -319,50 +341,105 @@ def _append_optimize_op(self, block, param_and_grad): if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) - moment1 = self._get_accumulator(self._moment1_acc_str, - param_and_grad[0]) - moment2 = self._get_accumulator(self._moment2_acc_str, - param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) - find_master = self._multi_precision and param_and_grad[ - 0].dtype == core.VarDesc.VarType.FP16 - master_weight = (self._master_weights[param_and_grad[0].name] - if find_master else None) + moment1 = self._get_accumulator( + self._moment1_acc_str, param_and_grad[0] + ) + moment2 = self._get_accumulator( + self._moment2_acc_str, param_and_grad[0] + ) + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param_and_grad[0] + ) + beta2_pow_acc = self._get_accumulator( + self._beta2_pow_acc_str, param_and_grad[0] + ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) lr = self._create_param_lr(param_and_grad) # create the adam optimize op if framework.in_dygraph_mode(): found_inf = self._get_auxiliary_var('found_inf') - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) + _beta1 = ( + self._beta1 + if not isinstance(self._beta1, Variable) + else self._beta1.numpy().item(0) + ) + _beta2 = ( + self._beta2 + if not isinstance(self._beta2, Variable) + else self._beta2.numpy().item(0) + ) _, _, _, _, _, _ = _C_ops.adam_( - param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, - _beta2, self._epsilon, self._lazy_mode, 1000, find_master, - False) + param_and_grad[0], + param_and_grad[1], + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + found_inf, + _beta1, + _beta2, + self._epsilon, + self._lazy_mode, + 1000, + find_master, + False, + ) return None if framework._in_legacy_dygraph(): - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if 
not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) + _beta1 = ( + self._beta1 + if not isinstance(self._beta1, Variable) + else self._beta1.numpy().item(0) + ) + _beta2 = ( + self._beta2 + if not isinstance(self._beta2, Variable) + else self._beta2.numpy().item(0) + ) _, _, _, _, _, _ = _legacy_C_ops.adam( - param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], - moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, - 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, - 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, - 'beta2', _beta2, 'multi_precision', find_master) + param_and_grad[0], + param_and_grad[1], + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + param_and_grad[0], + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + 'epsilon', + self._epsilon, + 'lazy_mode', + self._lazy_mode, + 'min_row_size_to_use_multithread', + 1000, + 'beta1', + _beta1, + 'beta2', + _beta2, + 'multi_precision', + find_master, + ) return None @@ -373,7 +450,7 @@ def _append_optimize_op(self, block, param_and_grad): "Moment1": [moment1], "Moment2": [moment2], "Beta1Pow": [beta1_pow_acc], - "Beta2Pow": [beta2_pow_acc] + "Beta2Pow": [beta2_pow_acc], } outputs = { "ParamOut": [param_and_grad[0]], @@ -385,7 +462,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "lazy_mode": self._lazy_mode, "min_row_size_to_use_multithread": 1000, - "multi_precision": find_master + "multi_precision": find_master, } if isinstance(self._beta1, Variable): @@ -405,11 +482,13 @@ def _append_optimize_op(self, block, param_and_grad): inputs["MasterParam"] = master_weight outputs["MasterParamOut"] = master_weight - adam_op = block.append_op(type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return adam_op @@ -426,7 +505,7 @@ def step(self): .. code-block:: python import paddle - + a = paddle.rand([2,13], dtype="float32") linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. @@ -445,27 +524,34 @@ def step(self): if param._grad_ivar() is not None: grad_var = param._grad_ivar() if in_dygraph_mode(): - if hasattr(grad_var, "is_selected_rows" - ) and grad_var.is_selected_rows( - ) and self.regularization is not None: + if ( + hasattr(grad_var, "is_selected_rows") + and grad_var.is_selected_rows() + and self.regularization is not None + ): raise RuntimeError( "Adam don't support weight_decay with sparse parameters, please set it to None." ) else: - if hasattr( - grad_var, "_is_sparse") and grad_var._is_sparse( - ) and self.regularization is not None: + if ( + hasattr(grad_var, "_is_sparse") + and grad_var._is_sparse() + and self.regularization is not None + ): raise RuntimeError( "Adam don't support weight_decay with sparse parameters, please set it to None." 
) params_grads.append((param, grad_var)) - optimize_ops = self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + optimize_ops = self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=0, + ) else: # optimize parameters in groups - for param_group in self._param_groups: + for idx, param_group in enumerate(self._param_groups): params_grads = defaultdict(lambda: list()) for param in param_group['params']: if param.stop_gradient: @@ -474,13 +560,16 @@ def step(self): grad_var = param._grad_ivar() params_grads['params'].append((param, grad_var)) params_grads.update( - {k: v - for k, v in param_group.items() if k != 'params'}) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + {k: v for k, v in param_group.items() if k != 'params'} + ) + self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=idx, + ) - def _multi_tensor_init(self, target_block, parameters): + def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). This function will be overridden in the corresponding optimizer file. @@ -492,26 +581,49 @@ def _multi_tensor_init(self, target_block, parameters): for param in parameters: moment1 = self._get_accumulator(self._moment1_acc_str, param) moment2 = self._get_accumulator(self._moment2_acc_str, param) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param) + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param + ) + beta2_pow_acc = self._get_accumulator( + self._beta2_pow_acc_str, param + ) if param.dtype == paddle.float32: - self._param_dict['FP32_LODTensor'].append(param) - self._moment1_dict['FP32_LODTensor'].append(moment1) - self._moment2_dict['FP32_LODTensor'].append(moment2) - self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc) - self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc) + self._param_dict['FP32_LODTensor'][param_group_idx].append( + param + ) + self._moment1_dict['FP32_LODTensor'][param_group_idx].append( + moment1 + ) + self._moment2_dict['FP32_LODTensor'][param_group_idx].append( + moment2 + ) + self._beta1_pow_acc_dict['FP32_LODTensor'][ + param_group_idx + ].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP32_LODTensor'][ + param_group_idx + ].append(beta2_pow_acc) elif param.dtype == paddle.float16: - self._param_dict['FP16_LODTensor'].append(param) - self._moment1_dict['FP16_LODTensor'].append(moment1) - self._moment2_dict['FP16_LODTensor'].append(moment2) - self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc) - self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc) + self._param_dict['FP16_LODTensor'][param_group_idx].append( + param + ) + self._moment1_dict['FP16_LODTensor'][param_group_idx].append( + moment1 + ) + self._moment2_dict['FP16_LODTensor'][param_group_idx].append( + moment2 + ) + self._beta1_pow_acc_dict['FP16_LODTensor'][ + param_group_idx + ].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP16_LODTensor'][ + param_group_idx + ].append(beta2_pow_acc) if self._multi_precision: - self._master_weight_dict['FP16_LODTensor'].append( - self._master_weights[param.name]) + self._master_weight_dict['FP16_LODTensor'][ + param_group_idx + 
].append(self._master_weights[param.name]) else: self._master_weight_dict['FP16_LODTensor'] = None else: @@ -519,9 +631,13 @@ def _multi_tensor_init(self, target_block, parameters): "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." ) - def _append_optimize_multi_tensor_op(self, target_block, - parameters_and_grads): - """ + def _append_optimize_multi_tensor_op( + self, + target_block, + parameters_and_grads, + param_group_idx, + ): + """ For Multi Tensor, append optimize merged_operator to block. """ assert isinstance(target_block, framework.Block) @@ -534,15 +650,19 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[1] is None: continue if param_and_grad[0].stop_gradient is False: - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) @@ -553,97 +673,149 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[0].stop_gradient is False: param_grad_dict = dict() param_grad_dict['params'] = param_and_grad - param_grad_dict.update({ - k: v - for k, v in parameters_and_grads.items() - if k != 'params' - }) + param_grad_dict.update( + { + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + } + ) param_and_grad = self._update_param_group(param_grad_dict) - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] for key in multi_tensor_list: - if len(self._param_dict[key]) > 0: + if len(self._param_dict[key][param_group_idx]) > 0: find_master = self._multi_precision and key == 'FP16_LODTensor' - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) + _beta1 = ( + self._beta1 + if not isinstance(self._beta1, Variable) + else self._beta1.numpy().item(0) + ) + _beta2 = ( + self._beta2 + if not isinstance(self._beta2, Variable) + else self._beta2.numpy().item(0) + ) if framework._non_static_mode(): + master_weight = self._master_weight_dict[key] + master_weight = ( + master_weight[param_group_idx] + if master_weight is not None + else None + ) 
if in_dygraph_mode(): + _, _, _, _, _, _ = _C_ops.merged_adam_( - self._param_dict[key], grad_dict[key], lr_dict[key], - self._moment1_dict[key], self._moment2_dict[key], - self._beta1_pow_acc_dict[key], - self._beta2_pow_acc_dict[key], - self._master_weight_dict[key], _beta1, _beta2, - self._epsilon, find_master, False) + self._param_dict[key][param_group_idx], + grad_dict[key], + lr_dict[key], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + _beta1, + _beta2, + self._epsilon, + find_master, + False, + ) else: _, _, _, _, _, _ = _legacy_C_ops.merged_adam( - self._param_dict[key], grad_dict[key], lr_dict[key], - self._moment1_dict[key], self._moment2_dict[key], - self._beta1_pow_acc_dict[key], - self._beta2_pow_acc_dict[key], - self._master_weight_dict[key], - self._param_dict[key], self._moment1_dict[key], - self._moment2_dict[key], - self._beta1_pow_acc_dict[key], - self._beta2_pow_acc_dict[key], - self._master_weight_dict[key], 'epsilon', - self._epsilon, 'beta1', _beta1, 'beta2', _beta2, - 'multi_precision', find_master) + self._param_dict[key][param_group_idx], + grad_dict[key], + lr_dict[key], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + self._param_dict[key][param_group_idx], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + 'epsilon', + self._epsilon, + 'beta1', + _beta1, + 'beta2', + _beta2, + 'multi_precision', + find_master, + ) else: inputs = { - "Param": self._param_dict[key], + "Param": self._param_dict[key][param_group_idx], "Grad": grad_dict[key], "LearningRate": lr_dict[key], - "Moment1": self._moment1_dict[key], - "Moment2": self._moment2_dict[key], - "Beta1Pow": self._beta1_pow_acc_dict[key], - "Beta2Pow": self._beta2_pow_acc_dict[key] + "Moment1": self._moment1_dict[key][param_group_idx], + "Moment2": self._moment2_dict[key][param_group_idx], + "Beta1Pow": self._beta1_pow_acc_dict[key][ + param_group_idx + ], + "Beta2Pow": self._beta2_pow_acc_dict[key][ + param_group_idx + ], } outputs = { - "ParamOut": self._param_dict[key], - "Moment1Out": self._moment1_dict[key], - "Moment2Out": self._moment2_dict[key], - "Beta1PowOut": self._beta1_pow_acc_dict[key], - "Beta2PowOut": self._beta2_pow_acc_dict[key] + "ParamOut": self._param_dict[key][param_group_idx], + "Moment1Out": self._moment1_dict[key][param_group_idx], + "Moment2Out": self._moment2_dict[key][param_group_idx], + "Beta1PowOut": self._beta1_pow_acc_dict[key][ + param_group_idx + ], + "Beta2PowOut": self._beta2_pow_acc_dict[key][ + param_group_idx + ], } attrs = { "epsilon": self._epsilon, "beta1": _beta1, - "beta2": _beta2 + "beta2": _beta2, } if find_master: - inputs["MasterParam"] = self._master_weight_dict[key] + inputs["MasterParam"] = self._master_weight_dict[key][ + param_group_idx + ] outputs["MasterParamOut"] = self._master_weight_dict[ - key] + key + ][param_group_idx] attrs["multi_precision"] = find_master - target_block.append_op(type="merged_adam", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + target_block.append_op( + type="merged_adam", + inputs=inputs, + outputs=outputs, + attrs=attrs, + 
stop_gradient=True, + ) return None def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) self._beta2 = parameters.get('beta2', self._default_dict['beta2']) self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) - self._lazy_mode = parameters.get('lazy_mode', - self._default_dict['lazy_mode']) + self._lazy_mode = parameters.get( + 'lazy_mode', self._default_dict['lazy_mode'] + ) parameters = parameters.get('params') return parameters diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 988ac052b0307b..5ccda36a2c361d 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -123,29 +123,35 @@ class Momentum(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate=0.001, - momentum=0.9, - parameters=None, - use_nesterov=False, - weight_decay=None, - grad_clip=None, - multi_precision=False, - rescale_grad=1.0, - use_multi_tensor=False, - name=None): + def __init__( + self, + learning_rate=0.001, + momentum=0.9, + parameters=None, + use_nesterov=False, + weight_decay=None, + grad_clip=None, + multi_precision=False, + rescale_grad=1.0, + use_multi_tensor=False, + name=None, + ): if learning_rate is None: raise ValueError("learning_rate is not set") if momentum is None: raise ValueError("momentum is not set") - predicate = lambda regular: isinstance(regular, - (L2DecayRegularizer, float)) + predicate = lambda regular: isinstance( + regular, (L2DecayRegularizer, float) + ) if isinstance(parameters, list): if isinstance(parameters[0], dict): for param_group in parameters: - decay = param_group[ - 'weight_decay'] if 'weight_decay' in param_group else weight_decay + decay = ( + param_group['weight_decay'] + if 'weight_decay' in param_group + else weight_decay + ) reg_method, reg_coeff = self._update_regularization(decay) param_group['regularization_method'] = reg_method param_group['regularization_coeff'] = reg_coeff @@ -153,16 +159,20 @@ def __init__(self, param_group['weight_decay'] = py_regular py_regular = None if predicate(weight_decay) else weight_decay - super(Momentum, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=py_regular, - grad_clip=grad_clip, - name=name) + super(Momentum, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=py_regular, + grad_clip=grad_clip, + name=name, + ) self.type = "momentum" self._momentum = momentum self._use_nesterov = bool(use_nesterov) - self._regularization_method, self._regularization_coeff = self._update_regularization( - weight_decay) + ( + self._regularization_method, + self._regularization_coeff, + ) = self._update_regularization(weight_decay) self._multi_precision = multi_precision self._rescale_grad = rescale_grad self._master_weights = {} @@ -176,29 +186,21 @@ def __init__(self, } self._use_multi_tensor = use_multi_tensor if self._use_multi_tensor: - self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._velocity_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._master_weight_dict = { - 'FP32_LODTensor': None, - 'FP16_LODTensor': [] - } - self._regularization_method_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } - self._regularization_coeff_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } + self._param_dict = self._create_multi_tensor_dict() + self._velocity_dict = self._create_multi_tensor_dict() + self._master_weight_dict = 
self._create_multi_tensor_dict() + self._master_weight_dict['FP32_LODTensor'] = None + self._regularization_method_dict = self._create_multi_tensor_dict() + self._regularization_coeff_dict = self._create_multi_tensor_dict() def _update_regularization(self, weight_decay): reg_method = "" reg_coeff = 0.0 - if (isinstance(weight_decay, L2DecayRegularizer)): + if isinstance(weight_decay, L2DecayRegularizer): reg_method = "l2_decay" reg_coeff = weight_decay._regularization_coeff - if (isinstance(weight_decay, float)): + if isinstance(weight_decay, float): reg_method = "l2_decay" reg_coeff = weight_decay return reg_method, reg_coeff @@ -211,19 +213,23 @@ def _create_master_weight(self, param): var_name = param.name + "_fp32_master" var_name = unique_name.generate(var_name) - var = layers.create_global_var(name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) block = self.helper.startup_program.global_block() - block.append_op(type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) self._master_weights[param.name] = var return var @@ -239,15 +245,22 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 - target_param = self._master_weights[ - param.name] if find_master else param + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) target_name = target_param.name - if (name not in self._accumulators - or target_name not in self._accumulators[name]): + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): raise Exception( "Accumulator {} does not exist for parameter {}".format( - name, target_name)) + name, target_name + ) + ) return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): @@ -265,7 +278,10 @@ def _create_accumulators(self, block, parameters): master_p = self._create_master_weight(p) self._add_accumulator(self._velocity_acc_str, master_p) continue - if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): warnings.warn( "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." "Consider using multi_precision=True option of the Momentum optimizer." @@ -273,25 +289,28 @@ def _create_accumulators(self, block, parameters): self._add_accumulator(self._velocity_acc_str, p) def _create_regularization_of_grad(self, param, grad, regularization=None): - """ Create and add backward regularization Operators - + """Create and add backward regularization Operators + Function helper of append_regularization_ops. """ # If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused # L2Decay with momentum which can refer to _append_optimize_op below. 
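The comment above is the key to the Momentum path: when a parameter's regularizer is `L2DecayRegularizer`, the decay is skipped here and instead folded into the fused momentum kernel through `regularization_method`/`regularization_coeff` in `_append_optimize_op`. A reference sketch of that fused update in plain NumPy, assuming the usual decay-into-gradient semantics (not Paddle API):

```python
# Reference sketch of the fused "l2_decay + momentum" update described above;
# plain NumPy, assuming the decay term is folded into the gradient in-kernel.
import numpy as np


def fused_momentum_step(param, grad, velocity, lr, mu, coeff, use_nesterov=False):
    grad = grad + coeff * param              # l2_decay applied inside the kernel
    velocity_out = mu * velocity + grad      # classic momentum accumulation
    if use_nesterov:
        param_out = param - lr * (grad + mu * velocity_out)
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out


p = np.ones(3, dtype=np.float32)
g = np.full(3, 0.5, dtype=np.float32)
v = np.zeros(3, dtype=np.float32)
p, v = fused_momentum_step(p, g, v, lr=0.1, mu=0.9, coeff=0.01)
```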
- if hasattr(param, 'regularizer') and isinstance(param.regularizer, - L2DecayRegularizer): + if hasattr(param, 'regularizer') and isinstance( + param.regularizer, L2DecayRegularizer + ): return grad return super(Momentum, self)._create_regularization_of_grad( - param, grad, regularization) + param, grad, regularization + ) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) - velocity_acc = self._get_accumulator(self._velocity_acc_str, - param_and_grad[0]) + velocity_acc = self._get_accumulator( + self._velocity_acc_str, param_and_grad[0] + ) lr = self._create_param_lr(param_and_grad) # For fusion of momentum and l2decay @@ -308,30 +327,56 @@ def _append_optimize_op(self, block, param_and_grad): regularization_method = "" regularization_coeff = 0.0 - find_master = self._multi_precision and param_and_grad[ - 0].dtype == core.VarDesc.VarType.FP16 - master_weight = (self._master_weights[param_and_grad[0].name] - if find_master else None) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if _in_legacy_dygraph(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) _, _, _ = _legacy_C_ops.momentum( - param_and_grad[0], param_and_grad[1], velocity_acc, lr, - master_weight, param_and_grad[0], velocity_acc, master_weight, - 'mu', self._momentum, 'use_nesterov', self._use_nesterov, - 'regularization_method', regularization_method, - 'regularization_coeff', regularization_coeff, 'multi_precision', - find_master) + param_and_grad[0], + param_and_grad[1], + velocity_acc, + lr, + master_weight, + param_and_grad[0], + velocity_acc, + master_weight, + 'mu', + self._momentum, + 'use_nesterov', + self._use_nesterov, + 'regularization_method', + regularization_method, + 'regularization_coeff', + regularization_coeff, + 'multi_precision', + find_master, + ) return None if in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) - return _C_ops.momentum_(param_and_grad[0], param_and_grad[1], - velocity_acc, lr, master_weight, - self._momentum, self._use_nesterov, - regularization_method, regularization_coeff, - find_master, self._rescale_grad) + return _C_ops.momentum_( + param_and_grad[0], + param_and_grad[1], + velocity_acc, + lr, + master_weight, + self._momentum, + self._use_nesterov, + regularization_method, + regularization_coeff, + find_master, + self._rescale_grad, + ) attrs = { "mu": self._momentum, @@ -339,19 +384,19 @@ def _append_optimize_op(self, block, param_and_grad): "regularization_method": regularization_method, "regularization_coeff": regularization_coeff, "multi_precision": find_master, - "rescale_grad": self._rescale_grad + "rescale_grad": self._rescale_grad, } inputs = { "Param": [param_and_grad[0]], "Grad": [param_and_grad[1]], "Velocity": [velocity_acc], - "LearningRate": [lr] + "LearningRate": [lr], } outputs = { "ParamOut": [param_and_grad[0]], - "VelocityOut": [velocity_acc] + "VelocityOut": [velocity_acc], } if find_master: @@ -359,15 +404,17 @@ def _append_optimize_op(self, block, param_and_grad): outputs["MasterParamOut"] = master_weight # create the momentum optimize op - momentum_op = block.append_op(type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) 
+ momentum_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return momentum_op - def _multi_tensor_init(self, target_block, parameters): + def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). This function will be overridden in the corresponding optimizer file. @@ -385,38 +432,59 @@ def _multi_tensor_init(self, target_block, parameters): # we skip param's l2decay before, so fuse it with momentum here. if isinstance(param.regularizer, L2DecayRegularizer): regularization_method = "l2_decay" - regularization_coeff = param.regularizer._regularization_coeff + regularization_coeff = ( + param.regularizer._regularization_coeff + ) elif param.regularizer is not None: regularization_method = "" regularization_coeff = 0.0 if param.dtype == paddle.float32: - self._param_dict['FP32_LODTensor'].append(param) - self._velocity_dict['FP32_LODTensor'].append(velocity_acc) + self._param_dict['FP32_LODTensor'][param_group_idx].append( + param + ) + self._velocity_dict['FP32_LODTensor'][param_group_idx].append( + velocity_acc + ) # fp32 no master weight - self._regularization_method_dict['FP32_LODTensor'].append( - regularization_method) - self._regularization_coeff_dict['FP32_LODTensor'].append( - regularization_coeff) + self._regularization_method_dict['FP32_LODTensor'][ + param_group_idx + ].append(regularization_method) + self._regularization_coeff_dict['FP32_LODTensor'][ + param_group_idx + ].append(regularization_coeff) elif param.dtype == paddle.float16: - self._param_dict['FP16_LODTensor'].append(param) - self._velocity_dict['FP16_LODTensor'].append(velocity_acc) + self._param_dict['FP16_LODTensor'][param_group_idx].append( + param + ) + self._velocity_dict['FP16_LODTensor'][param_group_idx].append( + velocity_acc + ) if self._multi_precision: - self._master_weight_dict['FP16_LODTensor'].append( - self._master_weights[param.name]) + self._master_weight_dict['FP16_LODTensor'][ + param_group_idx + ].append(self._master_weights[param.name]) else: - self._master_weight_dict['FP16_LODTensor'] = None - self._regularization_method_dict['FP16_LODTensor'].append( - regularization_method) - self._regularization_coeff_dict['FP16_LODTensor'].append( - regularization_coeff) + self._master_weight_dict['FP16_LODTensor'][ + param_group_idx + ] = None + self._regularization_method_dict['FP16_LODTensor'][ + param_group_idx + ].append(regularization_method) + self._regularization_coeff_dict['FP16_LODTensor'][ + param_group_idx + ].append(regularization_coeff) else: raise ValueError( "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." ) - def _append_optimize_multi_tensor_op(self, target_block, - parameters_and_grads): - """ + def _append_optimize_multi_tensor_op( + self, + target_block, + parameters_and_grads, + param_group_idx, + ): + """ For Multi Tensor, append optimize merged_operator to block. 
""" assert isinstance(target_block, framework.Block) @@ -429,15 +497,19 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[1] is None: continue if param_and_grad[0].stop_gradient is False: - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) @@ -448,97 +520,144 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[0].stop_gradient is False: param_grad_dict = dict() param_grad_dict['params'] = param_and_grad - param_grad_dict.update({ - k: v - for k, v in parameters_and_grads.items() - if k != 'params' - }) + param_grad_dict.update( + { + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + } + ) param_and_grad = self._update_param_group(param_grad_dict) - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] for key in multi_tensor_list: - if len(self._param_dict[key]) > 0: + if len(self._param_dict[key][param_group_idx]) > 0: find_master = self._multi_precision and key == 'FP16_LODTensor' + master_weight = self._master_weight_dict[key] + master_weight = ( + master_weight[param_group_idx] + if master_weight is not None + else None + ) + if framework._non_static_mode(): if in_dygraph_mode(): _, _, _ = _C_ops.merged_momentum_( - self._param_dict[key], grad_dict[key], - self._velocity_dict[key], lr_dict[key], - self._master_weight_dict[key], self._momentum, + self._param_dict[key][param_group_idx], + grad_dict[key], + self._velocity_dict[key][param_group_idx], + lr_dict[key], + master_weight, + self._momentum, self._use_nesterov, - self._regularization_method_dict[key], - self._regularization_coeff_dict[key], find_master, - self._rescale_grad) + self._regularization_method_dict[key][ + param_group_idx + ], + self._regularization_coeff_dict[key][ + param_group_idx + ], + find_master, + self._rescale_grad, + ) else: _, _, _ = _legacy_C_ops.merged_momentum( - self._param_dict[key], grad_dict[key], - self._velocity_dict[key], lr_dict[key], - self._master_weight_dict[key], - self._param_dict[key], self._velocity_dict[key], - self._master_weight_dict[key], 'mu', self._momentum, - 'use_nesterov', self._use_nesterov, + 
self._param_dict[key][param_group_idx], + grad_dict[key], + self._velocity_dict[key][param_group_idx], + lr_dict[key], + master_weight, + self._param_dict[key][param_group_idx], + self._velocity_dict[key][param_group_idx], + master_weight, + 'mu', + self._momentum, + 'use_nesterov', + self._use_nesterov, 'regularization_method', - self._regularization_method_dict[key], + self._regularization_method_dict[key][ + param_group_idx + ], 'regularization_coeff', - self._regularization_coeff_dict[key], - 'multi_precision', find_master) + self._regularization_coeff_dict[key][ + param_group_idx + ], + 'multi_precision', + find_master, + ) else: inputs = { - "Param": self._param_dict[key], + "Param": self._param_dict[key][param_group_idx], "Grad": grad_dict[key], - "Velocity": self._velocity_dict[key], + "Velocity": self._velocity_dict[key][param_group_idx], "LearningRate": lr_dict[key], } outputs = { - "ParamOut": self._param_dict[key], - "VelocityOut": self._velocity_dict[key], + "ParamOut": self._param_dict[key][param_group_idx], + "VelocityOut": self._velocity_dict[key][ + param_group_idx + ], } attrs = { - "mu": - self._momentum, - "use_nesterov": - self._use_nesterov, - "regularization_method": - self._regularization_method_dict[key], - "regularization_coeff": - self._regularization_coeff_dict[key], + "mu": self._momentum, + "use_nesterov": self._use_nesterov, + "regularization_method": self._regularization_method_dict[ + key + ][ + param_group_idx + ], + "regularization_coeff": self._regularization_coeff_dict[ + key + ][param_group_idx], } if find_master: - inputs["MasterParam"] = self._master_weight_dict[key] + inputs["MasterParam"] = self._master_weight_dict[key][ + param_group_idx + ] outputs["MasterParamOut"] = self._master_weight_dict[ - key] + key + ][param_group_idx] attrs["multi_precision"] = find_master - target_block.append_op(type="merged_momentum", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + target_block.append_op( + type="merged_momentum", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return None def _update_param_group(self, parameters): - self._momentum = parameters.get('momentum', - self._default_dict['momentum']) - self._use_nesterov = parameters.get('use_nesterov', - self._default_dict['use_nesterov']) - self._rescale_grad = parameters.get('rescale_grad', - self._default_dict['rescale_grad']) + self._momentum = parameters.get( + 'momentum', self._default_dict['momentum'] + ) + self._use_nesterov = parameters.get( + 'use_nesterov', self._default_dict['use_nesterov'] + ) + self._rescale_grad = parameters.get( + 'rescale_grad', self._default_dict['rescale_grad'] + ) self._regularization_method = parameters.get( - 'regularization_method', - self._default_dict['regularization_method']) + 'regularization_method', self._default_dict['regularization_method'] + ) self._regularization_coeff = parameters.get( - 'regularization_coeff', self._default_dict['regularization_coeff']) + 'regularization_coeff', self._default_dict['regularization_coeff'] + ) parameters = parameters.get('params') return parameters diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 1d399021c8e8d0..58d389020c7caa 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -21,13 +21,30 @@ import paddle from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table -from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, 
default_startup_program, device_guard +from paddle.fluid.framework import ( + Program, + Variable, + name_scope, + default_main_program, + default_startup_program, + device_guard, +) from ..fluid import framework from ..fluid import layers from ..fluid import unique_name -from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name -from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops +from ..fluid.backward import ( + append_backward, + _some_in_set_, + _append_grad_suffix_, + _get_no_grad_set_name, +) +from ..fluid.clip import ( + GradientClipBase, + GradientClipByNorm, + error_clip_callback, + append_gradient_clip_ops, +) from ..fluid.framework import program_guard, Parameter from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper @@ -42,24 +59,36 @@ from .lr import LRScheduler import copy from paddle import _C_ops, _legacy_C_ops -from paddle.fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _current_expected_place, in_dygraph_mode +from paddle.fluid.framework import ( + _in_legacy_dygraph, + _in_eager_without_dygraph_check, + _current_expected_place, + in_dygraph_mode, +) __all__ = [] @framework.static_only -def append_backward_new(loss_list, - parameter_list=None, - no_grad_set=None, - callbacks=None, - checkpoints=None, - distop_context=None): +def append_backward_new( + loss_list, + parameter_list=None, + no_grad_set=None, + callbacks=None, + checkpoints=None, + distop_context=None, +): from paddle.incubate.autograd.primx import orig2prim, Transform + program = default_main_program() - assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." + assert ( + program.num_blocks == 1 + ), "The append_backward_new interface is designed to process only one block." block = program.current_block() for el in loss_list: - assert el.block == block, f'variable in loss_list should be in current block of main program' + assert ( + el.block == block + ), f'variable in loss_list should be in current block of main program' orig2prim(block) ad = Transform(block) @@ -163,12 +192,14 @@ class Optimizer(object): """ @imperative_base.no_grad - def __init__(self, - learning_rate, - parameters=None, - weight_decay=None, - grad_clip=None, - name=None): + def __init__( + self, + learning_rate, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): if parameters is not None: # paddle.Tensor is also iterable, so here we don't check whether @@ -177,13 +208,16 @@ def __init__(self, if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)): raise TypeError( "`parameters` argument given to the optimizer should be " - "an iterable of paddle Tensors, but got argument type is `{}`." 
- .format(type(parameters))) + "an iterable of paddle Tensors, but got argument type is `{}`.".format( + type(parameters) + ) + ) if isinstance(parameters, dict): raise TypeError( "`parameters` argument should not get dict type, " "if parameter groups is needed, please set `parameters`" - " as list of dict") + " as list of dict" + ) self._parameter_list = list(parameters) else: self._parameter_list = None @@ -197,18 +231,22 @@ def __init__(self, if weight_decay is not None: if not isinstance(self._parameter_list[0], dict): for param in self._parameter_list: - if hasattr(param, 'regularizer' - ) and param.regularizer is not None: + if ( + hasattr(param, 'regularizer') + and param.regularizer is not None + ): logging.info( "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" - % weight_decay.__str__()) + % weight_decay.__str__() + ) break if not isinstance(learning_rate, (float, LRScheduler)): raise TypeError( - "learning rate should be float or LRScheduler, got %s here" % - type(learning_rate)) + "learning rate should be float or LRScheduler, got %s here" + % type(learning_rate) + ) if grad_clip is not None: if not isinstance(grad_clip, GradientClipBase): raise TypeError( @@ -216,6 +254,7 @@ def __init__(self, ) if isinstance(weight_decay, float): from ..fluid.regularizer import L2Decay + self.regularization = L2Decay(weight_decay) else: self.regularization = weight_decay @@ -227,8 +266,9 @@ def __init__(self, if self._parameter_list: if isinstance(self._parameter_list[0], dict): for param_group in self._parameter_list: - assert 'params' in param_group, \ - 'params should be set in parameters if parameter groups are optimized in different options' + assert ( + 'params' in param_group + ), 'params should be set in parameters if parameter groups are optimized in different options' self._dtype = self._parameter_list[0]['params'][0].dtype else: self._dtype = self._parameter_list[0].dtype @@ -248,7 +288,7 @@ def __init__(self, self.clear_gradients = self.clear_grad self._default_dict = { 'weight_decay': self.regularization, - 'grad_clip': self._grad_clip + 'grad_clip': self._grad_clip, } self._param_groups = [] @@ -261,13 +301,20 @@ def __init__(self, # NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode. # Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam]. self._use_multi_tensor = None - self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._param_dict = self._create_multi_tensor_dict() self._auxiliary_vars = {} def _set_auxiliary_var(self, key, val): self._auxiliary_vars[key] = val + def _create_multi_tensor_dict(self): + n = len(self._param_groups) if self._param_groups is not None else 1 + return { + 'FP32_LODTensor': [[] for _ in range(n)], + 'FP16_LODTensor': [[] for _ in range(n)], + } + def _get_auxiliary_var(self, key): return self._auxiliary_vars.get(key, None) @@ -277,12 +324,12 @@ def state_dict(self): Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. 
- Args: + Args: None Returns: state_dict(dict) : dict contains all the Tensor used by optimizer - + Examples: .. code-block:: python @@ -311,11 +358,11 @@ def set_state_dict(self, state_dict): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed. - Args: + Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer Return: None - + Examples: .. code-block:: python @@ -326,7 +373,7 @@ def set_state_dict(self, state_dict): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -353,8 +400,9 @@ def set_state_dict(self, state_dict): self._accumulators_holder = state_dict for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): - assert var_tmp.name in state_dict, \ - "optimizer Tensor {} not found".format( var_tmp.name ) + assert ( + var_tmp.name in state_dict + ), "optimizer Tensor {} not found".format(var_tmp.name) var = var_tmp.value() tensor = var.get_tensor() model_np = np.array(tensor) @@ -368,16 +416,23 @@ def set_state_dict(self, state_dict): elif isinstance(load_para, np.ndarray): load_para_np = load_para else: - raise RuntimeError("State dict type {} not supprt".format( - str(type(load_para)))) - - assert model_np.shape == load_para_np.shape, \ - "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - model_np.name, model_np.shape, load_para_np.shape) + raise RuntimeError( + "State dict type {} not supprt".format( + str(type(load_para)) + ) + ) + + assert ( + model_np.shape == load_para_np.shape + ), "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( + model_np.name, model_np.shape, load_para_np.shape + ) - assert model_np.dtype == load_para_np.dtype, \ - "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - model_np.name, model_np.dtype, load_para_np.dtype) + assert ( + model_np.dtype == load_para_np.dtype + ), "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + model_np.name, model_np.dtype, load_para_np.dtype + ) tensor.set(load_para_np, framework._current_expected_place()) @@ -386,51 +441,63 @@ def get_opti_var_name_list(self): def _create_global_learning_rate(self): # lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr - _lr_dtype = paddle.get_default_dtype( - ) if self._dtype is None else self._dtype - _lr_dtype = paddle.float32 if ( - paddle.get_default_dtype() != "float16" - and _lr_dtype == paddle.float16) else _lr_dtype + _lr_dtype = ( + paddle.get_default_dtype() if self._dtype is None else self._dtype + ) + _lr_dtype = ( + paddle.float32 + if ( + paddle.get_default_dtype() != "float16" + and _lr_dtype == paddle.float16 + ) + else _lr_dtype + ) if isinstance(self._learning_rate, LRScheduler): lr_var = self._global_learning_rate() # only create global lr_var once if not isinstance(lr_var, framework.Variable): lr_name = unique_name.generate('learning_rate') self._learning_rate._var_name = lr_name - lr_var = self.helper.create_global_variable(name=lr_name, - shape=[1], - persistable=True, - stop_gradient=True, - dtype=_lr_dtype) + lr_var = 
self.helper.create_global_variable( + name=lr_name, + shape=[1], + persistable=True, + stop_gradient=True, + dtype=_lr_dtype, + ) main_prog = framework.default_main_program() main_prog.lr_sheduler = self._learning_rate main_prog.lr_var = lr_var self._learning_rate_map[ - framework.default_main_program()] = lr_var + framework.default_main_program() + ] = lr_var lr_value = float(self._learning_rate()) self.helper.set_variable_initializer( - lr_var, initializer=Constant(value=lr_value)) + lr_var, initializer=Constant(value=lr_value) + ) elif isinstance(self._learning_rate, float): # only create global lr_var once lr = self._global_learning_rate() if isinstance(lr, framework.Variable): return else: - self._learning_rate_map[framework.default_main_program( - )] = layers.create_global_var( + self._learning_rate_map[ + framework.default_main_program() + ] = layers.create_global_var( name=unique_name.generate("learning_rate"), shape=[1], value=float(self._learning_rate), dtype=_lr_dtype, - persistable=True) + persistable=True, + ) @framework.dygraph_only def set_lr(self, value): """ :api_attr: imperative - + Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler, this API cannot be invoked, because it will lead to conflict. @@ -439,7 +506,7 @@ def set_lr(self, value): Returns: None - + Examples: .. code-block:: python @@ -465,7 +532,8 @@ def set_lr(self, value): if not isinstance(value, (int, float)): raise TypeError( "The type of 'value' in optimizer.set_lr must be float, but received %s." - % (type(value))) + % (type(value)) + ) if isinstance(self._learning_rate, LRScheduler): raise RuntimeError( "optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict." @@ -475,27 +543,40 @@ def set_lr(self, value): if current_lr is not None: if in_dygraph_mode(): place = _current_expected_place() - _C_ops.full_(current_lr, list(current_lr.shape), float(value), - current_lr.dtype, place) + _C_ops.full_( + current_lr, + list(current_lr.shape), + float(value), + current_lr.dtype, + place, + ) elif _in_legacy_dygraph(): - _legacy_C_ops.fill_constant(current_lr, 'value', float(value), - 'dtype', current_lr.dtype, 'shape', - list(current_lr.shape)) + _legacy_C_ops.fill_constant( + current_lr, + 'value', + float(value), + 'dtype', + current_lr.dtype, + 'shape', + list(current_lr.shape), + ) else: global_block = framework.default_main_program().global_block() - global_block.append_op(type='fill_constant', - outputs={'Out': [current_lr]}, - attrs={ - 'dtype': current_lr.dtype, - 'shape': list(current_lr.shape), - 'value': float(value) - }, - stop_gradient=True) + global_block.append_op( + type='fill_constant', + outputs={'Out': [current_lr]}, + attrs={ + 'dtype': current_lr.dtype, + 'shape': list(current_lr.shape), + 'value': float(value), + }, + stop_gradient=True, + ) def get_lr(self): """ - Get current learning rate of optimizer. + Get current learning rate of optimizer. If 'LRScheduler' is not used, the return value is all the same. If 'LRScheduler' is used, the return value is the current scheduled learing rete. 
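A note for reviewers before the `_create_optimization_pass` hunks below: `_create_multi_tensor_dict` (added earlier in this file) gives every multi-tensor cache one sub-list per parameter group, and `param_group_idx` selects the right sub-list throughout Adam and Momentum. The standalone sketch below mirrors that layout in plain Python so the `[key][param_group_idx]` indexing is easy to follow; `FakeParam` and `group_params` are illustrative helpers, not Paddle API.

```python
# Standalone sketch of the per-parameter-group cache layout built by
# _create_multi_tensor_dict(); FakeParam and group_params are illustrative only.
from collections import namedtuple

FakeParam = namedtuple("FakeParam", ["name", "dtype"])


def create_multi_tensor_dict(num_param_groups):
    # One inner list per parameter group, for each dtype bucket.
    return {
        'FP32_LODTensor': [[] for _ in range(num_param_groups)],
        'FP16_LODTensor': [[] for _ in range(num_param_groups)],
    }


def group_params(params_per_group):
    """Bucket parameters by dtype while keeping each optimizer group separate."""
    param_dict = create_multi_tensor_dict(len(params_per_group))
    for group_idx, group in enumerate(params_per_group):
        for p in group:
            key = 'FP32_LODTensor' if p.dtype == 'float32' else 'FP16_LODTensor'
            # Same indexing pattern as _multi_tensor_init /
            # _append_optimize_multi_tensor_op: [key][param_group_idx].
            param_dict[key][group_idx].append(p)
    return param_dict


groups = [
    [FakeParam("w0", "float32"), FakeParam("w1", "float16")],
    [FakeParam("w2", "float32")],
]
cache = group_params(groups)
assert [p.name for p in cache['FP32_LODTensor'][0]] == ["w0"]
assert [p.name for p in cache['FP32_LODTensor'][1]] == ["w2"]
assert [p.name for p in cache['FP16_LODTensor'][0]] == ["w1"]
```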
@@ -565,8 +646,7 @@ def _global_learning_rate(self, program=None): return self._learning_rate_map.get(program, None) def _append_optimize_op(self, block, param_and_grad): - """ append optimize operator to block and return all the added optimize_op - """ + """append optimize operator to block and return all the added optimize_op""" raise NotImplementedError( "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\"" ) @@ -583,8 +663,8 @@ def _create_param_lr(self, param_and_grad): return self._global_learning_rate() else: with default_main_program()._lr_schedule_guard( - is_with_opt=True), framework.name_scope( - 'scale_with_param_lr'): + is_with_opt=True + ), framework.name_scope('scale_with_param_lr'): return self._global_learning_rate() * param_lr else: return self._global_learning_rate() @@ -611,14 +691,16 @@ def _finish_update(self, block, parameters_and_grads): """ pass - def _add_accumulator(self, - name, - param, - dtype=None, - fill_value=0.0, - shape=None, - type=None, - device=None): + def _add_accumulator( + self, + name, + param, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None, + ): """Utility function to add an accumulator for a parameter Args: @@ -630,13 +712,17 @@ def _add_accumulator(self, """ if self._name is not None: name = self._name + "_" + name - if (name in self._accumulators - and param.name in self._accumulators[name]): + if ( + name in self._accumulators + and param.name in self._accumulators[name] + ): if framework._non_static_mode(): return self._accumulators[name][param.name] raise Exception( "Accumulator {} already exists for parameter {}".format( - name, param.name)) + name, param.name + ) + ) if shape == None: shape = param.shape assert isinstance(self.helper, LayerHelper) @@ -650,20 +736,25 @@ def _add_accumulator(self, persistable=True, dtype=dtype or param.dtype, type=core.VarDesc.VarType.LOD_TENSOR - if framework._in_eager_without_dygraph_check() else - (param.type if type is None else type), + if framework._in_eager_without_dygraph_check() + else (param.type if type is None else type), shape=shape, - belong_to_optimizer=True) + belong_to_optimizer=True, + ) if device is None: device = self._get_device_for_param(param.name) with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value))) + var, initializer=Constant(value=float(fill_value)) + ) if framework._non_static_mode(): if len(self._accumulators_holder) > 0: - assert var_name in self._accumulators_holder, \ - "Optimizer set error, {} should in state dict".format( var_name ) + assert ( + var_name in self._accumulators_holder + ), "Optimizer set error, {} should in state dict".format( + var_name + ) var.set_value(self._accumulators_holder[var_name]) self._accumulators[name][param.name] = var @@ -681,11 +772,15 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - if (name not in self._accumulators - or param.name not in self._accumulators[name]): + if ( + name not in self._accumulators + or param.name not in self._accumulators[name] + ): raise Exception( "Accumulator {} does not exist for parameter {}".format( - name, param.name)) + name, param.name + ) + ) return self._accumulators[name][param.name] def _update_param_device_map(self, parameters_and_grads, target_block): @@ -693,13 +788,15 @@ def _update_param_device_map(self, parameters_and_grads, target_block): if param_and_grad[0].stop_gradient is False: param_name = 
param_and_grad[0].name ops = target_block.ops - device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + device_attr_name = ( + core.op_proto_and_checker_maker.kOpDeviceAttrName() ) for op in ops: input_arg_names = op.input_arg_names if param_name in input_arg_names: self._param_device_map[param_name] = op.attr( - device_attr_name) + device_attr_name + ) break def _get_device_for_param(self, param_name): @@ -708,7 +805,9 @@ def _get_device_for_param(self, param_name): device = self._param_device_map[param_name] return device - def _create_optimization_pass(self, parameters_and_grads): + def _create_optimization_pass( + self, parameters_and_grads, param_group_idx=0 + ): """Add optimization operators to update gradients to tensors. Args: @@ -736,10 +835,12 @@ def _create_optimization_pass(self, parameters_and_grads): target_block = global_block current_block = framework.default_main_program().current_block() if current_block.idx != global_block.idx: - assert current_block.backward_block_idx != -1, \ - "current block is not global_block, but it doesn't have backward block." + assert ( + current_block.backward_block_idx != -1 + ), "current block is not global_block, but it doesn't have backward block." target_block = framework.default_main_program().blocks[ - current_block.backward_block_idx] + current_block.backward_block_idx + ] start = len(target_block.ops) self.helper = LayerHelper(self.__class__.__name__) @@ -748,57 +849,91 @@ def _create_optimization_pass(self, parameters_and_grads): # NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode if self._use_multi_tensor and self.__class__.__name__ in [ - 'Momentum', 'Adam' + 'Momentum', + 'Adam', ]: - if len(self._param_dict['FP32_LODTensor']) == 0 and len( - self._param_dict['FP16_LODTensor']) == 0: + if ( + len(self._param_dict['FP32_LODTensor'][param_group_idx]) == 0 + and len(self._param_dict['FP16_LODTensor'][param_group_idx]) + == 0 + ): if isinstance(parameters_and_grads, list): - self._multi_tensor_init(target_block, [ - p[0] - for p in parameters_and_grads if not p[0].stop_gradient - ]) + assert param_group_idx == 0 + self._multi_tensor_init( + target_block, + [ + p[0] + for p in parameters_and_grads + if not p[0].stop_gradient + ], + param_group_idx, + ) else: self._update_param_group(parameters_and_grads) - self._multi_tensor_init(target_block, [ - p[0] for p in parameters_and_grads['params'] - if not p[0].stop_gradient - ]) + self._multi_tensor_init( + target_block, + [ + p[0] + for p in parameters_and_grads['params'] + if not p[0].stop_gradient + ], + param_group_idx, + ) if framework._non_static_mode(): - self._append_optimize_multi_tensor_op(target_block, - parameters_and_grads) + self._append_optimize_multi_tensor_op( + target_block, + parameters_and_grads, + param_group_idx=param_group_idx, + ) else: - self._update_param_device_map(parameters_and_grads, - target_block) + self._update_param_device_map( + parameters_and_grads, target_block + ) # NOTE: Multi Tensor requires all parameters to be in the same device and program. # param_grad_list = [p_0,g_0,p_1,g_1,....] 
param_grad_list = [] for param_and_grad in parameters_and_grads: - if not param_and_grad[0].stop_gradient and param_and_grad[ - 1] is not None: + if ( + not param_and_grad[0].stop_gradient + and param_and_grad[1] is not None + ): param_grad_list.append(param_and_grad[0]) param_grad_list.append(param_and_grad[1]) with param_grad_list[0].block.program._optimized_guard( - param_grad_list), name_scope("optimizer"): + param_grad_list + ), name_scope("optimizer"): device = self._get_device_for_param(param_grad_list[0].name) with device_guard(device): self._append_optimize_multi_tensor_op( - target_block, parameters_and_grads) + target_block, + parameters_and_grads, + param_group_idx=param_group_idx, + ) else: if not framework._non_static_mode(): - params_grads_device_map = parameters_and_grads[ - 'params'] if isinstance(parameters_and_grads, - dict) else parameters_and_grads - self._update_param_device_map(params_grads_device_map, - target_block) + params_grads_device_map = ( + parameters_and_grads['params'] + if isinstance(parameters_and_grads, dict) + else parameters_and_grads + ) + self._update_param_device_map( + params_grads_device_map, target_block + ) if isinstance(parameters_and_grads, list): - self._create_accumulators(target_block, [ - p[0] for p in parameters_and_grads if not p[0].stop_gradient - ]) + self._create_accumulators( + target_block, + [ + p[0] + for p in parameters_and_grads + if not p[0].stop_gradient + ], + ) else: params_acc_dict = parameters_and_grads.copy() params_acc_dict['params'] = [ - p[0] for p in params_acc_dict['params'] + p[0] + for p in params_acc_dict['params'] if not p[0].stop_gradient ] self._create_accumulators(target_block, params_acc_dict) @@ -809,8 +944,9 @@ def _create_optimization_pass(self, parameters_and_grads): if param_and_grad[1] is None: continue if param_and_grad[0].stop_gradient is False: - self._append_optimize_op(target_block, - param_and_grad) + self._append_optimize_op( + target_block, param_and_grad + ) else: for param_and_grad in parameters_and_grads['params']: if param_and_grad[1] is None: @@ -818,25 +954,31 @@ def _create_optimization_pass(self, parameters_and_grads): if param_and_grad[0].stop_gradient is False: param_grad_dict = dict() param_grad_dict['params'] = param_and_grad - param_grad_dict.update({ - k: v - for k, v in parameters_and_grads.items() - if k != 'params' - }) - self._append_optimize_op(target_block, - param_grad_dict) + param_grad_dict.update( + { + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + } + ) + self._append_optimize_op( + target_block, param_grad_dict + ) else: for param_and_grad in parameters_and_grads: if param_and_grad[1] is None: continue with param_and_grad[0].block.program._optimized_guard( - param_and_grad), name_scope("optimizer"): + param_and_grad + ), name_scope("optimizer"): if param_and_grad[0].stop_gradient is False: device = self._get_device_for_param( - param_and_grad[0].name) + param_and_grad[0].name + ) with device_guard(device): optimize_op = self._append_optimize_op( - target_block, param_and_grad) + target_block, param_and_grad + ) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies @@ -848,12 +990,14 @@ def _create_optimization_pass(self, parameters_and_grads): def _append_dgc_ops(self, param_and_grad): pass - def backward(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None, - callbacks=None): + def backward( + self, + loss, + startup_program=None, + parameters=None, + 
no_grad_set=None, + callbacks=None, + ): """ The first part of ``minimize``, do auto-diff to append backward operations for the current program. @@ -884,7 +1028,7 @@ def backward(self, a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -902,8 +1046,7 @@ def backward(self, self._dtype = loss.dtype if framework._non_static_mode(): - parameter_list = parameters if parameters \ - else self._parameter_list + parameter_list = parameters if parameters else self._parameter_list params_grads = [] for param in parameter_list: @@ -917,23 +1060,26 @@ def backward(self, if callbacks is None: callbacks = [error_clip_callback] else: - assert (isinstance(callbacks, list)) + assert isinstance(callbacks, list) program = loss.block.program - assert len(loss.shape) == 1 and loss.shape[0] == 1, \ - "The loss.shape should be (1L,), but the current loss.shape is {}. " \ + assert len(loss.shape) == 1 and loss.shape[0] == 1, ( + "The loss.shape should be (1L,), but the current loss.shape is {}. " "Maybe that you should call paddle.mean to process the current loss.".format( - loss.shape) - parameter_list = parameters if parameters \ - else self._parameter_list + loss.shape + ) + ) + parameter_list = parameters if parameters else self._parameter_list with program_guard(program, startup_program): from paddle.incubate.autograd.utils import prim_enabled + if prim_enabled(): - params_grads = append_backward_new([loss], parameter_list, - act_no_grad_set, - callbacks) + params_grads = append_backward_new( + [loss], parameter_list, act_no_grad_set, callbacks + ) else: - params_grads = append_backward(loss, parameter_list, - act_no_grad_set, callbacks) + params_grads = append_backward( + loss, parameter_list, act_no_grad_set, callbacks + ) # Note: since we can't use all_reduce_op now, # dgc_op should be the last op of one grad. self._append_dgc_ops(params_grads) @@ -978,13 +1124,16 @@ def apply_gradients(self, params_grads): params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any - params_grads = self.append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops( + params_grads, self.regularization + ) optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops - def _apply_optimize(self, loss, startup_program, params_grads): + def _apply_optimize( + self, loss, startup_program, params_grads, param_group_idx=0 + ): """ Second part of `minimize`, appending optimization operators for given `params_grads` pairs. @@ -997,38 +1146,49 @@ def _apply_optimize(self, loss, startup_program, params_grads): list: A list of operators appended to the current program. 
""" if framework._non_static_mode(): - with program_guard(framework.default_main_program(), - framework.default_startup_program()): + with program_guard( + framework.default_main_program(), + framework.default_startup_program(), + ): if isinstance(params_grads, list): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) params_grads = self.append_regularization_ops( - params_grads, self.regularization) + params_grads, self.regularization + ) else: grad_clip = params_grads['grad_clip'] if grad_clip is not None: params_grads['params'] = grad_clip( - params_grads['params']) + params_grads['params'] + ) params_grads['params'] = self.append_regularization_ops( - params_grads['params'], self.regularization) - optimize_ops = self._create_optimization_pass(params_grads) + params_grads['params'], self.regularization + ) + optimize_ops = self._create_optimization_pass( + params_grads, param_group_idx=param_group_idx + ) else: + assert param_group_idx == 0 program = loss.block.program with program_guard(program, startup_program): optimize_ops = self.apply_gradients(params_grads) return optimize_ops def _create_regularization_of_grad(self, param, grad, regularization=None): - """ Create and add backward regularization Operators - + """Create and add backward regularization Operators + Function helper of append_regularization_ops. """ # If no gradient or no regularization is specified, then we don't need to do anything if grad is None or ( - (not hasattr(param, 'regularizer') or - (hasattr(param, 'regularizer') and param.regularizer is None)) - and regularization is None): + ( + not hasattr(param, 'regularizer') + or (hasattr(param, 'regularizer') and param.regularizer is None) + ) + and regularization is None + ): return grad regularization_term = None if hasattr(param, 'regularizer') and param.regularizer is not None: @@ -1057,7 +1217,8 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): dtype=param.dtype, shape=param.shape, lod_level=param.lod_level, - type=core.VarDesc.VarType.LOD_TENSOR) + type=core.VarDesc.VarType.LOD_TENSOR, + ) inputs = {"X": [grad, regularization_term]} outputs = {"Out": [new_grad]} @@ -1065,9 +1226,9 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): return new_grad - def append_regularization_ops(self, - parameters_and_grads, - regularization=None): + def append_regularization_ops( + self, parameters_and_grads, regularization=None + ): r"""Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. @@ -1092,21 +1253,28 @@ def append_regularization_ops(self, if framework._non_static_mode(): for param, grad in parameters_and_grads: new_grad = self._create_regularization_of_grad( - param, grad, regularization) + param, grad, regularization + ) params_and_grads.append((param, new_grad)) else: repeate_regularizer = False with framework.name_scope('regularization'): for param, grad in parameters_and_grads: - if not repeate_regularizer and param.regularizer is not None and regularization is not None: + if ( + not repeate_regularizer + and param.regularizer is not None + and regularization is not None + ): repeate_regularizer = True logging.info( "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
- % regularization.__str__()) + % regularization.__str__() + ) with param.block.program._optimized_guard([param, grad]): new_grad = self._create_regularization_of_grad( - param, grad, regularization) + param, grad, regularization + ) params_and_grads.append((param, new_grad)) return params_and_grads @@ -1114,7 +1282,8 @@ def _get_no_grad_set(self, loss, no_grad_set=None): no_grad_set = _get_no_grad_set_name(no_grad_set) parameters = loss.block.program.global_block().all_parameters() param_no_trainable = set( - [param.name for param in parameters if param.stop_gradient is True]) + [param.name for param in parameters if param.stop_gradient is True] + ) # If the parameter is no trainable, it should not have a gradient. no_grad_set.update(param_no_trainable) @@ -1128,13 +1297,13 @@ def clear_grad(self, set_to_zero=True): If not, new gradient will accumulat on previous gradient. There are two method to clear grad: set_to_zero or delete grad. - + Args: set_to_zero (bool, optional): If set grads to zero or not, default is True. - + Returns: None - + Examples: .. code-block:: python @@ -1145,7 +1314,7 @@ def clear_grad(self, set_to_zero=True): a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -1155,7 +1324,8 @@ def clear_grad(self, set_to_zero=True): """ param_list = [] if self._parameter_list is None or not isinstance( - self._parameter_list[0], dict): + self._parameter_list[0], dict + ): for p in self._parameter_list: if not p.stop_gradient: param_list.append(p) @@ -1172,11 +1342,9 @@ def clear_grad(self, set_to_zero=True): core.clear_gradients(param_list, set_to_zero) @imperative_base.no_grad - def minimize(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None): + def minimize( + self, loss, startup_program=None, parameters=None, no_grad_set=None + ): """ Add operations to minimize ``loss`` by updating ``parameters``. @@ -1195,13 +1363,13 @@ def minimize(self, tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: .. code-block:: python - + import paddle linear = paddle.nn.Linear(10, 10) input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) @@ -1221,17 +1389,18 @@ def minimize(self, """ assert isinstance(loss, Variable), "The loss should be an Tensor." 
- parameter_list = parameters if parameters \ - else self._parameter_list + parameter_list = parameters if parameters else self._parameter_list - params_grads = self.backward(loss, - startup_program=startup_program, - parameters=parameter_list, - no_grad_set=no_grad_set) + params_grads = self.backward( + loss, + startup_program=startup_program, + parameters=parameter_list, + no_grad_set=no_grad_set, + ) - optimize_ops = self._apply_optimize(loss, - startup_program=startup_program, - params_grads=params_grads) + optimize_ops = self._apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads + ) return optimize_ops, params_grads @@ -1240,7 +1409,7 @@ def minimize(self, def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -1254,7 +1423,7 @@ def step(self): a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -1271,13 +1440,16 @@ def step(self): grad_var = param._grad_ivar() params_grads.append((param, grad_var)) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=0, + ) else: # optimize parameters in groups - for param_group in self._param_groups: + for idx, param_group in enumerate(self._param_groups): params_grads = defaultdict(lambda: list()) for param in param_group['params']: if param.stop_gradient: @@ -1286,11 +1458,14 @@ def step(self): grad_var = param._grad_ivar() params_grads['params'].append((param, grad_var)) params_grads.update( - {k: v - for k, v in param_group.items() if k != 'params'}) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + {k: v for k, v in param_group.items() if k != 'params'} + ) + self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=idx, + ) def _add_param_group(self, param_group): """ @@ -1306,7 +1481,8 @@ def _add_param_group(self, param_group): elif isinstance(params, set): raise TypeError( "optimizer parameters should be in ordered collections," - "but received set, please use list instead.") + "but received set, please use list instead." + ) else: param_group['params'] = list(params) @@ -1320,18 +1496,21 @@ def _add_param_group(self, param_group): if not param_set.isdisjoint(set(param_group['params'])): raise ValueError( - "some parameters appear in more than one parameter group") + "some parameters appear in more than one parameter group" + ) for param in param_group['params']: weight_decay = param_group['weight_decay'] if isinstance(weight_decay, float): from ..fluid.regularizer import L2Decay + regularization = L2Decay(weight_decay) else: regularization = weight_decay param.regularizer = regularization param.optimize_attr['learning_rate'] = param_group.get( - 'learning_rate', 1.) 
+ 'learning_rate', 1.0 + ) self._param_groups.append(param_group) @@ -1345,7 +1524,7 @@ def _update_param_group(self, parameters): pass @framework.dygraph_only - def _multi_tensor_init(self, target_block, parameters): + def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). This function will be overridden in the corresponding optimizer file. @@ -1357,9 +1536,10 @@ def _multi_tensor_init(self, target_block, parameters): pass @framework.dygraph_only - def _append_optimize_multi_tensor_op(self, target_block, - parameters_and_grads): - """ + def _append_optimize_multi_tensor_op( + self, target_block, parameters_and_grads, param_group_idx + ): + """ For Multi Tensor, append optimize merged_operator to block. """ pass
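
Aside from the mechanical reformatting, the substantive change in these hunks is that the dygraph multi-tensor path is now tracked per parameter group: `_create_optimization_pass`, `_apply_optimize`, `_multi_tensor_init`, and `_append_optimize_multi_tensor_op` all gain a `param_group_idx` argument, `step()` supplies it by enumerating `self._param_groups`, and the cached buckets are looked up as `self._param_dict['FP32_LODTensor'][param_group_idx]` (with `assert param_group_idx == 0` guarding the single-list and static-graph paths). The snippet below is a minimal, framework-free sketch of that bookkeeping pattern, not the Paddle implementation; the class and helper names (`TinyMultiTensorOptimizer`, `_init_group_cache`, `_apply_group`) and the dict-based fake parameters are invented purely for illustration.

```python
from collections import defaultdict


class TinyMultiTensorOptimizer:
    """Sketch of per-group multi-tensor caching, mirroring how the diff
    threads ``param_group_idx`` from ``step()`` down to the helpers.
    All names here are illustrative, not Paddle APIs."""

    def __init__(self, param_groups):
        # Each group is a dict like {'params': [...], 'learning_rate': ...}.
        self._param_groups = param_groups
        # One bucket per dtype, indexed by the group's position, so groups
        # with different hyper-parameters never share fused buffers.
        self._param_dict = {
            'FP32_LODTensor': defaultdict(list),
            'FP16_LODTensor': defaultdict(list),
        }

    def _init_group_cache(self, params, param_group_idx):
        """Analogous to ``_multi_tensor_init``: bucket params by dtype."""
        for p in params:
            key = (
                'FP16_LODTensor'
                if p.get('dtype') == 'float16'
                else 'FP32_LODTensor'
            )
            self._param_dict[key][param_group_idx].append(p)

    def _apply_group(self, param_group_idx):
        """Analogous to ``_append_optimize_multi_tensor_op``: consume only
        the cached bucket that belongs to this group."""
        fp32 = self._param_dict['FP32_LODTensor'][param_group_idx]
        fp16 = self._param_dict['FP16_LODTensor'][param_group_idx]
        return len(fp32), len(fp16)

    def step(self):
        # Mirrors ``for idx, param_group in enumerate(self._param_groups)``.
        for idx, group in enumerate(self._param_groups):
            fp32_empty = not self._param_dict['FP32_LODTensor'][idx]
            fp16_empty = not self._param_dict['FP16_LODTensor'][idx]
            if fp32_empty and fp16_empty:
                self._init_group_cache(group['params'], idx)
            self._apply_group(idx)


# Usage: two groups keep separate multi-tensor caches.
opt = TinyMultiTensorOptimizer(
    [
        {'params': [{'name': 'w1', 'dtype': 'float32'}], 'learning_rate': 0.1},
        {'params': [{'name': 'w2', 'dtype': 'float16'}], 'learning_rate': 0.01},
    ]
)
opt.step()
assert opt._apply_group(0) == (1, 0) and opt._apply_group(1) == (0, 1)
```

Keying the cache by group index keeps fused buffers from mixing parameters that carry different hyper-parameters (learning rate, weight decay), which appears to be why the diff threads the index all the way down to the helpers instead of re-deriving it inside them.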