Add BF16 uniform random initializer #32468

Merged: 4 commits, Apr 29, 2021
3 changes: 3 additions & 0 deletions paddle/fluid/operators/fill_constant_op.h
@@ -117,6 +117,9 @@ class FillConstantKernel : public framework::OpKernel<T> {
}

if (actual_place == 0) {
VLOG(4) << "[CPU] FillConstantKernel"
<< ((data_type == framework::proto::VarType::BF16) ? "<bfloat16>"
: "<T>");
tensor->mutable_data(platform::CPUPlace(), data_type);
math::SetConstant<platform::CPUDeviceContext, T> functor;
functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
58 changes: 43 additions & 15 deletions paddle/fluid/operators/uniform_random_op.cc
@@ -18,10 +18,41 @@ limitations under the License. */
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/bfloat16.h"

namespace paddle {
namespace operators {

namespace {
template <typename T>
inline void UniformRealDistribution(T *data, const int64_t &size,
const float &min, const float &max,
const unsigned int &seed) {
VLOG(4) << "[CPU] UniformRandomKernel<T>";
std::uniform_real_distribution<T> dist(static_cast<T>(min),
static_cast<T>(max));
auto engine = paddle::framework::GetCPURandomEngine(seed);

for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}

template <>
inline void UniformRealDistribution(paddle::platform::bfloat16 *data,
const int64_t &size, const float &min,
const float &max,
const unsigned int &seed) {
VLOG(4) << "[CPU] UniformRandomKernel<bfloat16>";
std::uniform_real_distribution<float> dist(min, max);
auto engine = paddle::framework::GetCPURandomEngine(seed);

for (int64_t i = 0; i < size; ++i) {
data[i] = static_cast<paddle::platform::bfloat16>(dist(*engine));
}
}
} // namespace

// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
@@ -61,17 +92,11 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
framework::ToTypeName(out_var->Type())));
}
T *data = tensor->mutable_data<T>(ctx.GetPlace());

int64_t size = tensor->numel();
std::uniform_real_distribution<T> dist(
static_cast<T>(ctx.Attr<float>("min")),
static_cast<T>(ctx.Attr<float>("max")));
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
auto engine = framework::GetCPURandomEngine(seed);

for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
UniformRealDistribution<T>(
data, size, ctx.Attr<float>("min"), ctx.Attr<float>("max"),
static_cast<unsigned int>(ctx.Attr<int>("seed")));

unsigned int diag_num =
static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
@@ -257,9 +282,12 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::UniformRandomOpVarTypeInference);

REGISTER_OP_CPU_KERNEL(uniform_random,
paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>);
REGISTER_OP_CPU_KERNEL(uniform_random_batch_size_like,
paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>);
REGISTER_OP_CPU_KERNEL(
uniform_random, paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>,
paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
uniform_random_batch_size_like,
paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>,
paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
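
The bfloat16 specialization above draws its samples from a float distribution and only narrows them afterwards, because std::uniform_real_distribution is specified only for the built-in floating-point types. A rough NumPy sketch of the same sample-then-narrow idea (illustration only; it assumes plain bit truncation, whereas Paddle's float-to-bfloat16 cast may round differently):

import numpy as np

def uniform_bfloat16_bits(low, high, size, seed=0):
    # Draw uniform samples in float32 first, then narrow them to bfloat16
    # bit patterns stored as uint16 (the carrier dtype fluid uses).
    rng = np.random.RandomState(seed)
    samples = rng.uniform(low, high, size).astype(np.float32)
    # bfloat16 keeps the upper 16 bits of the float32 representation.
    return (samples.view(np.uint32) >> 16).astype(np.uint16)

print(uniform_bfloat16_bits(-1.0, 1.0, (2, 3)))
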
9 changes: 4 additions & 5 deletions paddle/fluid/operators/uniform_random_op.h
@@ -24,9 +24,9 @@ namespace operators {
using Tensor = framework::Tensor;

inline std::vector<int64_t> GetNewDataFromShapeTensor(
const Tensor *new_data_tensor) {
const Tensor* new_data_tensor) {
if (new_data_tensor->type() == framework::proto::VarType::INT64) {
auto *new_data = new_data_tensor->data<int64_t>();
auto* new_data = new_data_tensor->data<int64_t>();
framework::Tensor cpu_starts_tensor;
if (platform::is_gpu_place(new_data_tensor->place())) {
TensorCopySync(*new_data_tensor, platform::CPUPlace(),
@@ -37,7 +37,7 @@ inline std::vector<int64_t> GetNewDataFromShapeTensor(
new_data + new_data_tensor->numel());
return vec_new_data;
} else if (new_data_tensor->type() == framework::proto::VarType::INT32) {
auto *new_data = new_data_tensor->data<int32_t>();
auto* new_data = new_data_tensor->data<int32_t>();
std::vector<int64_t> vec_new_data;
framework::Tensor cpu_starts_tensor;
if (platform::is_gpu_place(new_data_tensor->place())) {
@@ -58,7 +58,7 @@ inline std::vector<int64_t> GetNewDataFromShapeTensor(
}

inline std::vector<int64_t> GetNewDataFromShapeTensorList(
const std::vector<const Tensor *> &list_new_shape_tensor) {
const std::vector<const Tensor*>& list_new_shape_tensor) {
std::vector<int64_t> vec_new_shape;
vec_new_shape.reserve(list_new_shape_tensor.size());
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
@@ -97,6 +97,5 @@ inline std::vector<int64_t> GetNewDataFromShapeTensorList(

return vec_new_shape;
}

} // namespace operators
} // namespace paddle
16 changes: 10 additions & 6 deletions python/paddle/fluid/initializer.py
@@ -247,7 +247,7 @@ def __call__(self, var, block=None):
self._seed = block.program.random_seed

# to be compatible of fp16 initializers
if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
if var.dtype == VarDesc.VarType.FP16:
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(
@@ -276,7 +276,7 @@ def __call__(self, var, block=None):
},
stop_gradient=True)

if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
if var.dtype == VarDesc.VarType.FP16:
block.append_op(
type="cast",
inputs={"X": out_var},
@@ -542,7 +542,8 @@ def __call__(self, var, block=None):
self._seed = block.program.random_seed

# to be compatible of fp16 initalizers
if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
if var.dtype == VarDesc.VarType.FP16 or (
var.dtype == VarDesc.VarType.BF16 and not self._uniform):
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(
@@ -584,7 +585,8 @@ def __call__(self, var, block=None):
},
stop_gradient=True)

if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
if var.dtype == VarDesc.VarType.FP16 or (
var.dtype == VarDesc.VarType.BF16 and not self._uniform):
block.append_op(
type="cast",
inputs={"X": out_var},
@@ -673,7 +675,8 @@ def __call__(self, var, block=None):
self._seed = block.program.random_seed

# to be compatible of fp16 initalizers
if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
if var.dtype == VarDesc.VarType.FP16 or (
var.dtype == VarDesc.VarType.BF16 and not self._uniform):
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(
@@ -715,7 +718,8 @@ def __call__(self, var, block=None):
},
stop_gradient=True)

if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
if var.dtype == VarDesc.VarType.FP16 or (
var.dtype == VarDesc.VarType.BF16 and not self._uniform):
block.append_op(
type="cast",
inputs={"X": out_var},
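
With the conditions above, a bfloat16 (uint16) parameter that takes the uniform path is filled directly by a single uniform_random op; the FP32 intermediate plus cast survives only for FP16 and for the non-uniform (Gaussian) bfloat16 paths. A minimal static-graph sketch of the uniform case, mirroring the unit-test pattern further down and assuming a build that includes the new CPU kernel:

import paddle.fluid.framework as framework
import paddle.fluid.initializer as initializer

# Build a bare program and let the initializer append its ops.
prog = framework.Program()
block = prog.global_block()
block.create_parameter(
    dtype="uint16",  # bfloat16 is carried as uint16 at the fluid level
    shape=[5, 10],
    lod_level=0,
    name="param",
    initializer=initializer.UniformInitializer(-0.5, 0.5, 123))
# Expected: a single uniform_random op and no trailing cast op.
print([op.type for op in block.ops])
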
7 changes: 4 additions & 3 deletions python/paddle/fluid/layers/nn.py
@@ -10523,10 +10523,10 @@ def uniform_random_batch_size_like(input,


"""
check_variable_and_dtype(input, 'Input', ("float32", 'float64'),
check_variable_and_dtype(input, 'Input', ("float32", 'float64', "uint16"),
'uniform_random_batch_size_like')
check_type(shape, 'shape', (list, tuple), 'uniform_random_batch_size_like')
check_dtype(dtype, 'dtype', ('float32', 'float64'),
check_dtype(dtype, 'dtype', ('float32', 'float64', "uint16"),
'uniform_random_batch_size_like')

helper = LayerHelper('uniform_random_batch_size_like', **locals())
@@ -15120,7 +15120,8 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0,
float(max), 'seed', seed, 'dtype', dtype)

check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand')
check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand')
check_dtype(dtype, 'dtype', ('float32', 'float64', 'uint16'),
'uniform_random/rand')

inputs = dict()
attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
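
Since the dtype checks now accept 'uint16', the static-graph uniform_random and uniform_random_batch_size_like layers can produce bfloat16 output without tripping type validation. A short sketch of the former, again assuming a build with this change applied:

import paddle
import paddle.fluid as fluid

paddle.enable_static()  # take the static-graph branch of fluid.layers.uniform_random

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    # 'uint16' is the carrier dtype for bfloat16; check_dtype rejected it
    # before this change.
    x = fluid.layers.uniform_random(
        shape=[2, 4], dtype='uint16', min=-1.0, max=1.0, seed=1)
print(x.dtype)
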
45 changes: 26 additions & 19 deletions python/paddle/fluid/tests/unittests/test_initializer.py
@@ -53,7 +53,7 @@ def test_constant_initializer_default_value(self, dtype="float32"):
lod_level=0,
name="param",
initializer=initializer.ConstantInitializer())
num_ops = 2 if dtype in ["float16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'fill_constant')
@@ -72,7 +72,7 @@ def test_constant_initializer(self, dtype="float32"):
lod_level=0,
name="param",
initializer=initializer.ConstantInitializer(2.3))
num_ops = 2 if dtype in ["float16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'fill_constant')
@@ -108,7 +108,7 @@ def test_uniform_initializer_default_value(self, dtype="float32"):
lod_level=0,
name="param",
initializer=initializer.UniformInitializer())
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
@@ -153,7 +153,7 @@ def test_uniform_initializer(self, dtype="float32"):
lod_level=0,
name="param",
initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
@@ -174,7 +174,7 @@ def test_uniform_initializer_two_op(self, dtype="float32"):
lod_level=0,
name="param",
initializer=initializer.UniformInitializer(-4.2, float(i), 123))
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op0 = block.ops[0]
self.assertEqual(init_op0.type, 'uniform_random')
@@ -195,13 +195,11 @@ def test_uniform_initializer_fp16(self):

def test_uniform_initializer_bf16(self):
"""Test uniform initializer with bfloat16
No cast operator has been added here
"""
block = self.test_uniform_initializer_default_value("uint16")
self.assertTrue(check_cast_op(block.ops[1]))
block = self.test_uniform_initializer(dtype="uint16")
self.assertTrue(check_cast_op(block.ops[1]))
block = self.test_uniform_initializer_two_op("uint16")
self.assertTrue(check_cast_op(block.ops[1]))


class TestNormalInitializer(unittest.TestCase):
@@ -347,7 +345,9 @@ def test_normal_xavier_initializer_conv(self):
self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
self.assertEqual(init_op.attr('seed'), 0)

def test_xavier_initializer_supplied_arguments(self, dtype="float32"):
def test_xavier_initializer_supplied_arguments(self,
dtype="float32",
uniform=True):
"""Test the Xavier initializer with supplied arguments
"""
program = framework.Program()
@@ -359,14 +359,18 @@ def test_xavier_initializer_supplied_arguments(self, dtype="float32"):
lod_level=0,
name="param",
initializer=initializer.XavierInitializer(
fan_in=12, fan_out=23, seed=134))
num_ops = 2 if dtype in ["float16", "uint16"] else 1
uniform=uniform, fan_in=12, fan_out=23, seed=134))
num_ops = 2 if (dtype == "float16" or (dtype == "uint16" and
not uniform)) else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
limit = np.sqrt(6.0 / (12 + 23))
self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
if uniform:
self.assertEqual(init_op.type, 'uniform_random')
limit = np.sqrt(6.0 / (12 + 23))
self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
else:
self.assertEqual(init_op.type, 'gaussian_random')
self.assertEqual(init_op.attr('seed'), 134)
return block

@@ -379,8 +383,12 @@ def test_xavier_initializer_fp16(self):
def test_xavier_initializer_bf16(self):
"""Test the Xavier initializer with bfloat16
"""
block = self.test_xavier_initializer_supplied_arguments("uint16")
self.assertTrue(check_cast_op(block.ops[1]))
block_uniform = self.test_xavier_initializer_supplied_arguments(
"uint16")
self.assertEqual(len(block_uniform.ops), 1)
block_gaussian = self.test_xavier_initializer_supplied_arguments(
"uint16", False)
self.assertTrue(check_cast_op(block_gaussian.ops[1]))


class TestMSRAInitializer(unittest.TestCase):
Expand Down Expand Up @@ -483,7 +491,7 @@ def test_msra_initializer_supplied_arguments(self, dtype="float32"):
name="param",
initializer=initializer.MSRAInitializer(
fan_in=12, seed=134))
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
@@ -503,7 +511,6 @@ def test_msra_initializer_bf16(self):
"""Test the MSRA initializer with bfloat16
"""
block = self.test_msra_initializer_supplied_arguments("uint16")
self.assertTrue(check_cast_op(block.ops[1]))


class TestBilinearInitializer(unittest.TestCase):
11 changes: 4 additions & 7 deletions python/paddle/fluid/tests/unittests/test_initializer_nn.py
@@ -225,7 +225,7 @@ def test_uniform_common(self, dtype="float32", seed=0):
lod_level=0,
name="param",
initializer=initializer.Uniform())
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
@@ -256,7 +256,7 @@ def test_uniform_initializer_default_value(self,
lod_level=0,
name="param",
initializer=initializer.Uniform())
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
@@ -287,7 +287,7 @@ def test_uniform_initializer(self,
lod_level=0,
name="param",
initializer=initializer.Uniform(min_value, max_vlaue))
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
@@ -317,7 +317,7 @@ def test_uniform_initializer_two_op(self,
lod_level=0,
name="param",
initializer=initializer.Uniform(min_value, float(i)))
num_ops = 2 if dtype in ["float16", "uint16"] else 1
num_ops = 2 if dtype == "float16" else 1
self.assertEqual(len(block.ops), num_ops)
init_op0 = block.ops[0]
self.assertEqual(init_op0.type, 'uniform_random')
@@ -343,11 +343,8 @@ def test_uniform_initializer_bf16(self):
"""Test uniform initializer with bfloat16
"""
block = self.test_uniform_initializer_default_value("uint16") #bfloat16
self.assertTrue(check_cast_op(block.ops[1]))
block = self.test_uniform_initializer(dtype="uint16") #bfloat16
self.assertTrue(check_cast_op(block.ops[1]))
block = self.test_uniform_initializer_two_op("uint16") #bfloat16
self.assertTrue(check_cast_op(block.ops[1]))

def test_uniform_initializer_dygraph(self):
"""Test uniform initializer in dygraph model.