[Hackathon No.40] Add ASGD API to Paddle #58834
File: ops.yaml

@@ -191,6 +191,19 @@
  backward : as_strided_grad
  no_need_buffer : input

- op : asgd_
  args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor d, Tensor y, Tensor n, Tensor master_param, bool multi_precision=false)
  output : Tensor(param_out), Tensor(d_out), Tensor(y_out), Tensor(master_param_out)
  infer_meta :
    func : ASGDInferMeta
  kernel :
    func : asgd
    data_type : param
    data_transform :
      support_trans_dtype : learning_rate, n
  optional : master_param, master_param_out
  inplace : (param -> param_out), (d -> d_out), (y -> y_out), (master_param -> master_param_out)

- op : asin
  args : (Tensor x)
  output : Tensor(out)

Review thread on support_trans_dtype : learning_rate, n:

Reviewer: Why does this need to be specified explicitly here?

Author: This follows the Paddle SGD entry, but SGD is the only op in ops.yaml that has it. In my tests, removing it did not affect the results, so I suspect it is a historical leftover. Removed.

Author: That reply was not accurate: after removing it, cmake has to be rerun, which I had not done before. Sorry about that.

Author: The reason is given below.
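For context (not part of the diff): the element-wise update this op performs, as implemented by the CPU and GPU kernels below, is

  d_out = d - y + grad
  y_out = grad
  param_out = param - (learning_rate / n) * d_out

Reading from the kernels, d appears to hold a running sum of recent gradients, y supplies the old gradient that drops out of that sum while y_out records the current one, and n is the count the sum is averaged over; the Python-side optimizer presumably decides which history slot is fed in as y.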
File: paddle/phi/kernels/asgd_kernel.h

@@ -0,0 +1,37 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"

namespace phi {

template <typename T, typename Context>
void ASGDKernel(const Context& dev_ctx,
                const DenseTensor& param,
                const DenseTensor& grad,
                const DenseTensor& learning_rate,
                const DenseTensor& d,
                const DenseTensor& y,
                const DenseTensor& n,
                const paddle::optional<DenseTensor>& master_param,
                bool multi_precision,
                DenseTensor* param_out,
                DenseTensor* d_out,
                DenseTensor* y_out,
                DenseTensor* master_param_out);

}  // namespace phi
File: paddle/phi/kernels/cpu/asgd_kernel.cc

@@ -0,0 +1,73 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/asgd_kernel.h"

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/jit/kernels.h"

namespace phi {

template <typename T, typename Context>
void ASGDKernelCPUImpl(const Context& dev_ctx,
                       const DenseTensor& param,
                       const DenseTensor& grad,
                       const DenseTensor& learning_rate,
                       const DenseTensor& d,
                       const DenseTensor& y,
                       const DenseTensor& n,
                       DenseTensor* param_out,
                       DenseTensor* d_out,
                       DenseTensor* y_out) {
  auto param_eigen = EigenVector<T>::Flatten(param);
  auto grad_eigen = EigenVector<T>::Flatten(grad);
  auto d_eigen = EigenVector<T>::Flatten(d);
  auto y_eigen = EigenVector<T>::Flatten(y);
  auto param_out_eigen = EigenVector<T>::Flatten(*param_out);
  auto d_out_eigen = EigenVector<T>::Flatten(*d_out);
  auto y_out_eigen = EigenVector<T>::Flatten(*y_out);
  T learning_rate_T = learning_rate.data<T>()[0];
  T n_T = n.data<T>()[0];

  d_out_eigen = d_eigen - y_eigen + grad_eigen;
  y_out_eigen = grad_eigen;
  param_out_eigen = param_eigen - (learning_rate_T / n_T) * d_out_eigen;
}

template <typename T, typename Context>
void ASGDKernel(const Context& dev_ctx,
                const DenseTensor& param,
                const DenseTensor& grad,
                const DenseTensor& learning_rate,
                const DenseTensor& d,
                const DenseTensor& y,
                const DenseTensor& n,
                const paddle::optional<DenseTensor>& master_param UNUSED,
                bool multi_precision UNUSED,
                DenseTensor* param_out,
                DenseTensor* d_out,
                DenseTensor* y_out,
                DenseTensor* master_param_out UNUSED) {
  dev_ctx.template Alloc<T>(param_out);
  dev_ctx.template Alloc<T>(d_out);
  dev_ctx.template Alloc<T>(y_out);
  ASGDKernelCPUImpl<T, Context>(
      dev_ctx, param, grad, learning_rate, d, y, n, param_out, d_out, y_out);
}

}  // namespace phi

PD_REGISTER_KERNEL(asgd, CPU, ALL_LAYOUT, phi::ASGDKernel, float, double) {}
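As a cross-check (not part of the PR), the Eigen expressions above are equivalent to the plain loop below; the function name is hypothetical and this is an illustrative sketch, not Paddle code.

// Illustrative reference only: the same element-wise update as the
// Eigen expressions in ASGDKernelCPUImpl, written as an explicit loop.
#include <cstddef>

template <typename T>
void asgd_update_reference(const T* param, const T* grad, const T* d,
                           const T* y, T learning_rate, T n, std::size_t num,
                           T* param_out, T* d_out, T* y_out) {
  for (std::size_t i = 0; i < num; ++i) {
    d_out[i] = d[i] - y[i] + grad[i];  // swap the old gradient y for the new one
    y_out[i] = grad[i];                // record the current gradient
    param_out[i] = param[i] - (learning_rate / n) * d_out[i];
  }
}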
File: paddle/phi/kernels/gpu/asgd_kernel.cu

@@ -0,0 +1,106 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/asgd_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h"

namespace phi {

template <typename T, typename MT>
__global__ void ASGDKernelGPUImpl(const T* param,
                                  const T* grad,
                                  const T* learning_rate,
                                  const T* d,
                                  const T* y,
                                  const T* n,
                                  const MT* master_param,
                                  int num,
                                  T* param_out,
                                  T* d_out,
                                  T* y_out,
                                  MT* master_param_out) {
  MT learning_rate_MT = static_cast<MT>(learning_rate[0]);
  MT n_MT = static_cast<MT>(n[0]);
  CUDA_KERNEL_LOOP(i, num) {
    MT param_data = master_param ? master_param[i] : static_cast<MT>(param[i]);
    MT grad_data = static_cast<MT>(grad[i]);
    MT d_data = static_cast<MT>(d[i]);
    MT y_data = static_cast<MT>(y[i]);
    d_data = d_data - y_data + grad_data;
    y_data = grad_data;
    param_data = param_data - (learning_rate_MT / n_MT) * d_data;
    param_out[i] = static_cast<T>(param_data);
    d_out[i] = static_cast<T>(d_data);
    y_out[i] = static_cast<T>(y_data);
    if (master_param_out) {
      master_param_out[i] = param_data;
    }
  }
}

template <typename T, typename Context>
void ASGDKernel(const Context& dev_ctx,
                const DenseTensor& param,
                const DenseTensor& grad,
                const DenseTensor& learning_rate,
                const DenseTensor& d,
                const DenseTensor& y,
                const DenseTensor& n,
                const paddle::optional<DenseTensor>& master_param,
                bool multi_precision,
                DenseTensor* param_out,
                DenseTensor* d_out,
                DenseTensor* y_out,
                DenseTensor* master_param_out) {
  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
  const MPDType* master_in_data =
      multi_precision ? master_param->data<MPDType>() : nullptr;
  MPDType* master_out_data =
      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_out)
                      : nullptr;

  int block = 512;
  int grid = (param.numel() + block - 1) / block;

  ASGDKernelGPUImpl<T, MPDType><<<grid, block, 0, dev_ctx.stream()>>>(
      param.data<T>(),
      grad.data<T>(),
      learning_rate.data<T>(),
      d.data<T>(),
      y.data<T>(),
      n.data<T>(),
      master_in_data,
      param.numel(),
      dev_ctx.template Alloc<T>(param_out),
      dev_ctx.template Alloc<T>(d_out),
      dev_ctx.template Alloc<T>(y_out),
      master_out_data);
}

}  // namespace phi

PD_REGISTER_KERNEL(asgd,
                   GPU,
                   ALL_LAYOUT,
                   phi::ASGDKernel,
                   phi::dtype::float16,
                   phi::dtype::bfloat16,
                   float,
                   double) {}
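A note on the multi_precision path (an observation about the code above, not part of the diff): when T is float16 or bfloat16, MPTypeTrait<T>::Type is float, so the arithmetic runs in fp32 and an optional fp32 master copy of the parameters is read and written alongside the low-precision ones. A minimal standalone sketch of that pattern, with hypothetical names and float/double standing in for the low/high-precision pair:

// Illustrative sketch of the master-weight pattern (not Paddle code).
// float plays the role of the low-precision dtype, double the master copy.
#include <cstddef>

void update_with_master(const float* param, const double* master,
                        double step, const double* d_new, std::size_t num,
                        float* param_out, double* master_out) {
  for (std::size_t i = 0; i < num; ++i) {
    // Prefer the high-precision copy so rounding error does not accumulate.
    double p = master ? master[i] : static_cast<double>(param[i]);
    p -= step * d_new[i];
    param_out[i] = static_cast<float>(p);  // rounded copy kept for the model
    if (master_out) master_out[i] = p;     // full-precision copy for the optimizer
  }
}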
Review thread on the kernel signature:

Reviewer: Keep the argument order consistent with the other optimizers; put grad in the second position.

Author: Done