Add FP16 ops for the Kunlun ResNet50 model #44672

Merged: 2 commits merged on Jul 29, 2022
121 changes: 71 additions & 50 deletions paddle/fluid/operators/fused/resnet_unit_op_xpu.cc
@@ -23,6 +23,8 @@ using Tensor = framework::Tensor;

template <typename T>
class ResNetUnitXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;

public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
@@ -63,9 +65,12 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
std::string act_type = ctx.Attr<std::string>("act_type");
auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

std::vector<const T *> x_list = {input_x->data<T>()};
std::vector<const T *> w_list = {filter_x->data<T>()};
std::vector<T *> conv_y_list = {conv_out_x->mutable_data<T>(place)};
std::vector<const XPUType *> x_list = {
reinterpret_cast<const XPUType *>(input_x->data<T>())};
std::vector<const XPUType *> w_list = {
reinterpret_cast<const XPUType *>(filter_x->data<T>())};
std::vector<XPUType *> conv_y_list = {
reinterpret_cast<XPUType *>(conv_out_x->mutable_data<T>(place))};

std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(input_x->dims())};
@@ -107,9 +112,10 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
Tensor *running_mean_z = ctx.Output<Tensor>("RunningMeanZ");
Tensor *running_var_z = ctx.Output<Tensor>("RunningVarZ");

x_list.push_back(input_z->data<T>());
w_list.push_back(filter_z->data<T>());
conv_y_list.push_back(conv_out_z->mutable_data<T>(place));
x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
conv_y_list.push_back(
reinterpret_cast<XPUType *>(conv_out_z->mutable_data<T>(place)));

x_shape_list.push_back(phi::vectorize<int>(input_z->dims()));

@@ -133,17 +139,17 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
if (fuse_add) {
const Tensor *input_z = ctx.Input<Tensor>("Z");
auto input_z_shape = phi::vectorize<int>(input_z->dims());
x_list.push_back(input_z->data<T>());
x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
x_shape_list.push_back(input_z_shape);
x_maxlist.push_back(nullptr);
}
}
int r = xpu::resnet_unit_fusion<T, T, T, int16_t>(
int r = xpu::resnet_unit_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
conv_y_list,
output->mutable_data<T>(place),
reinterpret_cast<XPUType *>(output->mutable_data<T>(place)),
x_shape_list,
filter_x_shape[0],
ksize_list,
@@ -172,6 +178,8 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {

template <typename T>
class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;

public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
@@ -208,11 +216,16 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {

auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

std::vector<const T *> x_list = {x->data<T>()};
std::vector<const T *> w_list = {filter_x->data<T>()};
std::vector<const T *> conv_y_list = {conv_out_x->data<T>()};
std::vector<T *> dx_list = {x_grad->mutable_data<T>(place)};
std::vector<T *> dw_list = {filter_x_grad->mutable_data<T>(place)};
std::vector<const XPUType *> x_list = {
reinterpret_cast<const XPUType *>(x->data<T>())};
std::vector<const XPUType *> w_list = {
reinterpret_cast<const XPUType *>(filter_x->data<T>())};
std::vector<const XPUType *> conv_y_list = {
reinterpret_cast<const XPUType *>(conv_out_x->data<T>())};
std::vector<XPUType *> dx_list = {
reinterpret_cast<XPUType *>(x_grad->mutable_data<T>(place))};
std::vector<XPUType *> dw_list = {
reinterpret_cast<XPUType *>(filter_x_grad->mutable_data<T>(place))};

std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(x->dims())};
@@ -262,11 +275,14 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
Tensor *scale_z_grad =
ctx.Output<Tensor>(framework::GradVarName("ScaleZ"));
Tensor *bias_z_grad = ctx.Output<Tensor>(framework::GradVarName("BiasZ"));
x_list.push_back(z->data<T>());
w_list.push_back(filter_z->data<T>());
conv_y_list.push_back(conv_out_z->data<T>());
dx_list.push_back(z_grad->mutable_data<T>(place));
dw_list.push_back(filter_z_grad->mutable_data<T>(place));
x_list.push_back(reinterpret_cast<const XPUType *>(z->data<T>()));
w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
conv_y_list.push_back(
reinterpret_cast<const XPUType *>(conv_out_z->data<T>()));
dx_list.push_back(
reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
dw_list.push_back(
reinterpret_cast<XPUType *>(filter_z_grad->mutable_data<T>(place)));
x_shape_list.push_back(phi::vectorize<int>(z->dims()));

auto filter_z_shape = phi::vectorize<int>(filter_z->dims());
@@ -288,38 +304,39 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
} else {
if (fuse_add) {
auto z_grad = ctx.Output<Tensor>(framework::GradVarName("Z"));
dx_list.push_back(z_grad->mutable_data<T>(place));
dx_list.push_back(
reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
}
}

int r =
xpu::resnet_unit_grad_fusion<T, T, T, int16_t>(dev_ctx.x_context(),
x_list,
w_list,
y_grad->data<T>(),
output->data<T>(),
conv_y_list,
dx_list,
dw_list,
x_shape_list,
filter_x_shape[0],
ksize_list,
stride_list,
paddings,
dilations,
group,
x_maxlist,
w_maxlist,
scale_list,
batch_mean_list,
batch_invstd_list,
dscale_list,
dbias_list,
xpu::Activation_t::RELU,
eps,
is_nchw,
has_shortcut,
fuse_add);
int r = xpu::resnet_unit_grad_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
reinterpret_cast<const XPUType *>(y_grad->data<T>()),
reinterpret_cast<const XPUType *>(output->data<T>()),
conv_y_list,
dx_list,
dw_list,
x_shape_list,
filter_x_shape[0],
ksize_list,
stride_list,
paddings,
dilations,
group,
x_maxlist,
w_maxlist,
scale_list,
batch_mean_list,
batch_invstd_list,
dscale_list,
dbias_list,
xpu::Activation_t::RELU,
eps,
is_nchw,
has_shortcut,
fuse_add);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion");
}
};
@@ -329,5 +346,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit,
ops::ResNetUnitXPUKernel<plat::float16>,
ops::ResNetUnitXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit_grad,
ops::ResNetUnitGradXPUKernel<plat::float16>,
ops::ResNetUnitGradXPUKernel<float>);
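In short, the kernels stay templated on the framework type T, `XPUTypeTrait<T>::Type` maps T to the type the XDNN library expects (for `plat::float16` that is the device half type), every tensor pointer is `reinterpret_cast` to that device type before the fused call, and the FP16 kernel is registered alongside the FP32 one. A minimal, self-contained sketch of the same trait-and-cast pattern, using toy types and a toy device function (hypothetical names, not the real Paddle/XDNN APIs):

```cpp
#include <cstdint>
#include <vector>

// Toy stand-ins for plat::float16 and XDNN's device half type (hypothetical
// names, not the Paddle/XDNN definitions). Both carry identical 16-bit
// payloads, which is what makes the reinterpret_cast at the boundary safe.
struct FwFloat16 { uint16_t bits; };
struct DevHalf   { uint16_t bits; };

// Same idea as XPUTypeTrait<T>: identity for float, device type for fp16.
template <typename T> struct TypeTraitSketch { using Type = T; };
template <> struct TypeTraitSketch<FwFloat16> { using Type = DevHalf; };

// Toy device entry point standing in for a fused xpu::* call.
template <typename XPUT>
int device_op(const XPUT* x, XPUT* y, int64_t n) {
  (void)x; (void)y; (void)n;
  return 0;  // XDNN-style status code
}

// Kernel templated on the framework type T, mirroring ResNetUnitXPUKernel.
template <typename T>
int RunKernel(const T* in, T* out, int64_t n) {
  using XPUType = typename TypeTraitSketch<T>::Type;
  // Only the nominal pointer type changes at the framework/device boundary.
  return device_op<XPUType>(reinterpret_cast<const XPUType*>(in),
                            reinterpret_cast<XPUType*>(out), n);
}

int main() {
  std::vector<float> a(8, 1.0f), b(8);
  int r = RunKernel<float>(a.data(), b.data(), 8);        // fp32 path
  std::vector<FwFloat16> c(8), d(8);
  r |= RunKernel<FwFloat16>(c.data(), d.data(), 8);       // fp16 path, same body
  return r;
}
```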
29 changes: 18 additions & 11 deletions paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc
@@ -22,6 +22,8 @@ namespace operators {

template <typename T>
class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;

public:
void Compute(const framework::ExecutionContext& ctx) const override {
bool multi_precision = ctx.Attr<bool>("multi_precision");
@@ -35,14 +37,14 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
auto master_param = ctx.MultiInput<framework::LoDTensor>("MasterParam");
auto master_param_out =
ctx.MultiOutput<framework::LoDTensor>("MasterParamOut");
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T epsilon = ctx.Attr<float>("epsilon");
T rescale_grad = ctx.Attr<float>("rescale_grad");
float mu = ctx.Attr<float>("mu");
float lars_coeff = ctx.Attr<float>("lars_coeff");
float epsilon = ctx.Attr<float>("epsilon");
float rescale_grad = ctx.Attr<float>("rescale_grad");

std::vector<T*> param_list;
std::vector<T*> grad_list;
std::vector<T*> param_out_list;
std::vector<XPUType*> param_list;
std::vector<XPUType*> grad_list;
std::vector<XPUType*> param_out_list;
std::vector<float*> velocity_list;
std::vector<float*> velocity_out_list;
std::vector<float*> lrs;
@@ -52,9 +54,12 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
std::vector<float*> master_param_out_list;
int op_num = param.size();
for (int i = 0; i < op_num; ++i) {
param_list.push_back(const_cast<T*>(param[i]->data<T>()));
grad_list.push_back(const_cast<T*>(grad[i]->data<T>()));
param_out_list.push_back(param_out[i]->mutable_data<T>(ctx.GetPlace()));
param_list.push_back(
reinterpret_cast<XPUType*>(const_cast<T*>((param[i]->data<T>()))));
grad_list.push_back(
reinterpret_cast<XPUType*>(const_cast<T*>(grad[i]->data<T>())));
param_out_list.push_back(reinterpret_cast<XPUType*>(
param_out[i]->mutable_data<T>(ctx.GetPlace())));
velocity_list.push_back(const_cast<float*>(velocity[i]->data<float>()));
velocity_out_list.push_back(
velocity_out[i]->mutable_data<float>(ctx.GetPlace()));
@@ -111,5 +116,7 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(lars_momentum,
ops::LarsMomentumOpXPUKernel<paddle::platform::float16>,
ops::LarsMomentumOpXPUKernel<float>);
#endif
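The optimizer change follows the same boundary rule: parameter and gradient buffers go through `XPUType` casts (FP16 under mixed precision), while velocities, learning rates, master parameters, and the scalar hyperparameters stay in `float`, so nothing is rounded through half precision on the host. For reference, a plain-float sketch of the LARS update that the fused call applies per tensor, assuming the standard formulation (trust-ratio learning rate from the parameter/gradient norms, then a heavy-ball step); `lars_step` and its `weight_decay` argument are hypothetical names for illustration, and the real call batches many tensors and does the FP16/master-weight bookkeeping on device:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Plain-float sketch of one LARS step for a single tensor (hypothetical
// helper, not a Paddle API). Under fp16 training the fused kernel reads and
// writes fp16 param/grad while velocity and master weights remain float.
void lars_step(std::vector<float>& param, const std::vector<float>& grad,
               std::vector<float>& velocity, float lr, float mu,
               float lars_coeff, float weight_decay, float epsilon,
               float rescale_grad) {
  double p_sq = 0.0, g_sq = 0.0;
  for (std::size_t i = 0; i < param.size(); ++i) {
    p_sq += static_cast<double>(param[i]) * param[i];
    const float g = grad[i] * rescale_grad;
    g_sq += static_cast<double>(g) * g;
  }
  const float p_norm = static_cast<float>(std::sqrt(p_sq));
  const float g_norm = static_cast<float>(std::sqrt(g_sq));
  // Trust-ratio ("local") learning rate.
  const float local_lr =
      lr * lars_coeff * p_norm / (g_norm + weight_decay * p_norm + epsilon);
  for (std::size_t i = 0; i < param.size(); ++i) {
    const float g = grad[i] * rescale_grad + weight_decay * param[i];
    velocity[i] = mu * velocity[i] + local_lr * g;  // heavy-ball accumulation
    param[i] -= velocity[i];
  }
}
```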
19 changes: 13 additions & 6 deletions paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -235,7 +235,9 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::FP16, XPUPlace())})},
{"generate_proposals_v2",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"grad_add",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"greater_equal",
XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
pOpKernelType(vartype::INT32, XPUPlace()),
@@ -258,9 +260,8 @@ XPUOpMap& get_kl2_ops() {
{"label_smooth",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"lars_momentum",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -380,9 +381,12 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::BOOL, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit",
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
Expand Down Expand Up @@ -496,6 +500,9 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"update_loss_scaling",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"unsqueeze2_grad",
XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace()),
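Registering the kernels is only half the wiring: an op also has to appear in this KL2 op list with every data type it supports, otherwise the FP16 kernel will not be selected on an XPU2 device even though it is registered. A toy model of that lookup, with a bare dtype standing in for the full `pOpKernelType` key (hypothetical names, not the Paddle API):

```cpp
#include <map>
#include <set>
#include <string>

// Toy model of the KL2 op list: op name -> set of data types it may run with
// on an XPU2 device. The real XPUKernelSet stores full kernel-type keys.
enum class DType { FP32, FP16 };

std::map<std::string, std::set<DType>> MakeKL2OpList() {
  return {
      {"resnet_unit", {DType::FP32, DType::FP16}},
      {"resnet_unit_grad", {DType::FP32, DType::FP16}},
      {"lars_momentum", {DType::FP32, DType::FP16}},
      {"grad_add", {DType::FP32, DType::FP16}},
  };
}

// Dispatch-time check in the spirit of what the framework does with the list:
// if the (op, dtype) pair is absent, the op is not run in fp16 on the XPU.
bool SupportedOnXPU2(const std::string& op, DType dtype) {
  static const auto kOps = MakeKL2OpList();
  const auto it = kOps.find(op);
  return it != kOps.end() && it->second.count(dtype) > 0;
}
```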
15 changes: 11 additions & 4 deletions paddle/phi/kernels/xpu/elementwise_add_kernel.cc
@@ -24,18 +24,25 @@ void GradAddXPUKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;

dev_ctx.template Alloc<T>(out);
auto x_shape = phi::vectorize<int>(x.dims());
auto y_shape = phi::vectorize<int>(y.dims());
int r = xpu::broadcast_add(dev_ctx.x_context(),
x.data<T>(),
y.data<T>(),
out->data<T>(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<const XPUType*>(y.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x_shape,
y_shape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
}

} // namespace phi

PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {}
PD_REGISTER_KERNEL(grad_add,
XPU,
ALL_LAYOUT,
phi::GradAddXPUKernel,
phi::dtype::float16,
float) {}
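`grad_add` is a phi kernel, so FP16 support amounts to one extra dtype in `PD_REGISTER_KERNEL` plus the usual `XPUType` casts around `xpu::broadcast_add`. The shape vectors give the call NumPy-style broadcasting semantics; a small CPU-only sketch of what that means (`broadcast_add` here is an illustration, not the XDNN implementation):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// CPU-only sketch of NumPy-style broadcasting for an elementwise add:
// shapes are right-aligned and size-1 dimensions repeat.
std::vector<float> broadcast_add(const std::vector<float>& x, std::vector<int> xs,
                                 const std::vector<float>& y, std::vector<int> ys) {
  while (xs.size() < ys.size()) xs.insert(xs.begin(), 1);  // left-pad ranks
  while (ys.size() < xs.size()) ys.insert(ys.begin(), 1);
  std::vector<int> os(xs.size());
  int64_t total = 1;
  for (size_t d = 0; d < xs.size(); ++d) {
    assert(xs[d] == ys[d] || xs[d] == 1 || ys[d] == 1);
    os[d] = std::max(xs[d], ys[d]);
    total *= os[d];
  }
  std::vector<float> out(static_cast<size_t>(total));
  for (int64_t i = 0; i < total; ++i) {
    int64_t rem = i, xi = 0, yi = 0;
    for (size_t d = 0; d < os.size(); ++d) {
      int64_t stride = 1;
      for (size_t k = d + 1; k < os.size(); ++k) stride *= os[k];
      const int idx = static_cast<int>(rem / stride);  // index along dim d
      rem %= stride;
      xi = xi * xs[d] + (xs[d] == 1 ? 0 : idx);  // fold into row-major offsets
      yi = yi * ys[d] + (ys[d] == 1 ? 0 : idx);
    }
    out[static_cast<size_t>(i)] =
        x[static_cast<size_t>(xi)] + y[static_cast<size_t>(yi)];
  }
  return out;
}

// Example: shape (2, 3) + shape (3) -> (2, 3), the typical bias-add pattern.
// auto z = broadcast_add({1, 2, 3, 4, 5, 6}, {2, 3}, {10, 20, 30}, {3});
```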
36 changes: 21 additions & 15 deletions paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc
@@ -26,6 +26,7 @@ void LogSoftmaxGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
int axis,
DenseTensor* x_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
const int rank = out.dims().size();
axis = funcs::CanonicalAxis(axis, rank);

@@ -40,24 +41,29 @@
PADDLE_ENFORCE_NE(
tmp2_ptr, nullptr, phi::errors::External("not enough memory in xpu"));

int r =
xpu::exp(dev_ctx.x_context(), out.data<T>(), tmp_ptr, out_grad.numel());
int r = xpu::exp<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out.data<T>()),
reinterpret_cast<XPUType*>(tmp_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp");
r = xpu::reciprocal(
dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel());
r = xpu::reciprocal<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp_ptr),
reinterpret_cast<XPUType*>(tmp2_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal");
r = xpu::mul(dev_ctx.x_context(),
tmp2_ptr,
out_grad.data<T>(),
tmp2_ptr,
out_grad.numel());
r = xpu::mul<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp2_ptr),
reinterpret_cast<const XPUType*>(out_grad.data<T>()),
reinterpret_cast<XPUType*>(tmp2_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
r = xpu::softmax_grad(dev_ctx.x_context(),
tmp_ptr,
tmp2_ptr,
x_grad->data<T>(),
out_shape,
axis);
r = xpu::softmax_grad<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp_ptr),
reinterpret_cast<const XPUType*>(tmp2_ptr),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
out_shape,
axis);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad");
}
}
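The FP16 enablement here is again just `XPUType` casts, but the four-call chain is worth spelling out. Assuming `xpu::softmax_grad(y, dy, ...)` implements the usual softmax backward $dx = y \odot (dy - \sum_j y_j\, dy_j)$ along the axis, the sequence exp → reciprocal → mul → softmax_grad reproduces the log-softmax gradient:

$$
\begin{aligned}
s &= \mathrm{softmax}(x) = e^{\mathrm{out}} && \text{(tmp = exp(out))}\\
g &= \frac{\partial L}{\partial \mathrm{out}} \oslash s && \text{(tmp2 = reciprocal(tmp), then mul by out\_grad)}\\
\frac{\partial L}{\partial x_i} &= s_i\Big(g_i - \sum_j s_j\, g_j\Big)
 = \frac{\partial L}{\partial \mathrm{out}_i} - s_i \sum_j \frac{\partial L}{\partial \mathrm{out}_j},
\end{aligned}
$$

which matches the analytic gradient of $\mathrm{out} = \log\mathrm{softmax}(x)$. Dividing by $s$ and multiplying it back is what requires the two scratch buffers; a direct $dy - s\sum_j dy_j$ formulation would avoid them, but that would be a separate change.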