diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 97c50adf4a7829..e9d48d8562927d 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -108,8 +108,12 @@ def __init__(self,
         self.regularization = regularization
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
+
         self._dtype = None
+        # Infer the dtype from parameters
+        if self._parameter_list:
+            self._dtype = self._parameter_list[0].dtype
+
         # each program should have a independent learning rate
         # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
@@ -768,7 +772,10 @@ def backward(self,
         else:
             act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
 
-        self._dtype = loss.dtype
+        # Infer dtype from loss if it is not set by parameters
+        if self._dtype is None:
+            self._dtype = loss.dtype
+
         if framework.in_dygraph_mode():
             parameter_list = parameter_list if parameter_list \
                 else self._parameter_list
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 91d70522331636..369a5bdae046f2 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -23,7 +23,8 @@
 import paddle.compat as cpt
 import numpy as np
 from paddle.fluid.backward import append_backward
-from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_
+import paddle
 
 
 class TestOptimizer(unittest.TestCase):
@@ -1042,5 +1043,37 @@ def test_program_desc(self, ):
                          ['sgd', 'sgd'])
 
 
+class TestOptimizerDtype(unittest.TestCase):
+    '''
+    The dtype of the optimizer should be inferred from the parameters, and the
+    learning rate is created with the same dtype.
+    '''
+
+    def check_with_dtype(self, dtype):
+        class MyLayer(paddle.nn.Layer):
+            def __init__(self, dtype):
+                super(MyLayer, self).__init__()
+                self._w = self.create_parameter([2, 3], dtype=dtype)
+                self._b = self.create_parameter([2, 3], dtype=dtype)
+
+            def forward(self, x):
+                return x * self._w + self._b
+
+        with paddle.fluid.dygraph.guard():
+            model = MyLayer(dtype)
+            x = paddle.rand([10, 2, 3], dtype=dtype)
+            loss = model(x)
+            adam = paddle.optimizer.Adam(parameters=model.parameters())
+            loss.backward()
+            adam.step()
+            self.assertEqual(adam._dtype, convert_np_dtype_to_dtype_(dtype))
+
+    def test_float64(self):
+        self.check_with_dtype('float64')
+
+    def test_float32(self):
+        self.check_with_dtype('float32')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 910c9b185dbaab..c51c00f4a716db 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -270,7 +270,6 @@ def step(self):
                 adam.step()
                 adam.clear_grad()
         """
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 2aa7fa115ec2ef..5f742820178cee 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -210,7 +210,6 @@ def minimize(self,
     @framework.dygraph_only
     @imperative_base.no_grad
     def step(self):
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 1cfc0b66e7b671..212dad7c77cb4f 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -132,8 +132,12 @@ def __init__(self,
         self.regularization = weight_decay
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
+
         self._dtype = None
+        # Infer the dtype from parameters
+        if self._parameter_list:
+            self._dtype = self._parameter_list[0].dtype
+
         # each program should have a independent learning rate
         # program -> tensor(learning_rate)
         self._learning_rate_map = dict()
@@ -675,7 +679,10 @@ def backward(self,
         else:
             act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
 
-        self._dtype = loss.dtype
+        # Infer dtype from loss if it is not set by parameters
+        if self._dtype is None:
+            self._dtype = loss.dtype
+
         if framework.in_dygraph_mode():
             parameter_list = parameters if parameters \
                 else self._parameter_list
@@ -885,6 +892,7 @@ def minimize(self,
 
         return optimize_ops, params_grads
 
+    @imperative_base.no_grad
     @framework.dygraph_only
     def step(self):
         """
@@ -910,7 +918,6 @@ def step(self):
                 adam.step()
                 adam.clear_grad()
         """
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable: