From a89fbdc171ef9a7cb43df3d38fb41caa6e0142ad Mon Sep 17 00:00:00 2001
From: huangjiyi <947613776@qq.com>
Date: Sat, 17 Dec 2022 03:20:01 +0800
Subject: [PATCH 1/2] move gather_scatter_kernel from fluid to phi

---
 paddle/fluid/operators/CMakeLists.txt         |  8 +--
 paddle/phi/kernels/CMakeLists.txt             |  1 -
 .../kernels/cpu}/gather_scatter_kernel.cc     | 23 +++----
 .../kernels/cpu/put_along_axis_grad_kernel.cc | 10 +--
 .../phi/kernels/cpu/put_along_axis_kernel.cc  | 14 ++--
 .../cpu/take_along_axis_grad_kernel.cc        |  6 +-
 .../phi/kernels/cpu/take_along_axis_kernel.cc |  8 +--
 .../kernels}/gather_scatter_kernel.h          | 67 +++++++++----------
 .../kernels/gpu}/gather_scatter_kernel.cu     | 21 +++---
 .../kernels/gpu/put_along_axis_grad_kernel.cu | 10 +--
 .../phi/kernels/gpu/put_along_axis_kernel.cu  | 14 ++--
 .../gpu/take_along_axis_grad_kernel.cu        |  6 +-
 .../phi/kernels/gpu/take_along_axis_kernel.cu |  8 +--
 13 files changed, 91 insertions(+), 105 deletions(-)
 rename paddle/{fluid/operators => phi/kernels/cpu}/gather_scatter_kernel.cc (92%)
 rename paddle/{fluid/operators => phi/kernels}/gather_scatter_kernel.h (76%)
 rename paddle/{fluid/operators => phi/kernels/gpu}/gather_scatter_kernel.cu (95%)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 882706dc8dd62..4aeb3d6b74398 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -95,13 +95,7 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-if (WITH_ROCM)
-    hip_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor)
-else()
-    cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor)
-endif()
-
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta sparse_backward_infermeta)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils backward_infermeta sparse_backward_infermeta)
 
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
         recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index abe35f284d664..a21319dc7f1e3 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -65,7 +65,6 @@ set(COMMON_KERNEL_DEPS
     deformable_conv_functor
     matrix_reduce
     segment_pooling
-    gather_scatter_kernel
     pooling
     maxouting
     matrix_inverse
diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/phi/kernels/cpu/gather_scatter_kernel.cc
similarity index 92%
rename from paddle/fluid/operators/gather_scatter_kernel.cc
rename to paddle/phi/kernels/cpu/gather_scatter_kernel.cc
index 1c6b2e6c1a095..824bed9ef2ed5 100644
--- a/paddle/fluid/operators/gather_scatter_kernel.cc
+++ b/paddle/phi/kernels/cpu/gather_scatter_kernel.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
-namespace paddle {
-namespace operators {
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
+
+namespace phi {
 
 class TensorAssign {
  public:
@@ -54,7 +54,7 @@ struct cpu_gather_scatter_functor {
                   const phi::DenseTensor& src,
                   const std::string& method_name,
                   const func_t& reduce_op,
-                  const platform::DeviceContext& ctx) {
+                  const phi::DeviceContext& ctx) {
     if (index.numel() == 0) {
       return;
     }
@@ -69,7 +69,7 @@ struct cpu_gather_scatter_functor {
     auto src_dims = src.dims();
     if (self_size == 0 || src_size == 0 || index_size == 0) {
       VLOG(3) << "zero size input found";
-      platform::errors::InvalidArgument(
+      phi::errors::InvalidArgument(
           "self_size, src_size, index_size cannot be 0");
       return;
     }
@@ -132,7 +132,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                        int dim,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
-                       const platform::DeviceContext& ctx) {
+                       const phi::DeviceContext& ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/false>()(
@@ -144,7 +144,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                int dim,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
-                               const platform::DeviceContext& ctx) {
+                               const phi::DeviceContext& ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(
@@ -156,7 +156,7 @@ void cpu_scatter_add_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx) {
+                            const phi::DeviceContext& ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(
@@ -168,7 +168,7 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx) {
+                            const phi::DeviceContext& ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(
@@ -180,7 +180,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self,
                                    int dim,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor output,
-                                   const platform::DeviceContext& ctx) {
+                                   const phi::DeviceContext& ctx) {
   auto* index_data = index.data<index_t>();
   auto* output_data = output.data<tensor_t>();
 
@@ -219,5 +219,4 @@ Instantiate_Template_Function(cpu_gather_kernel)
             Instantiate_Template_Function(cpu_scatter_mul_kernel)
                 Instantiate_Template_Function(cpu_scatter_input_grad_kernel)
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
index 969c5b9fe3306..f891f08cc412c 100644
--- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
@@ -14,12 +14,12 @@
 
 #include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -41,7 +41,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   if (x_grad) {
     phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
     if (index_type == DataType::INT32) {
-      paddle::operators::cpu_scatter_input_grad_kernel<T, int32_t>(
+      phi::cpu_scatter_input_grad_kernel<T, int32_t>(
           // Here passing an unused argument out_grad, because it's
           // convenient to instantiate a bunch of template function with the
           // same arguments list.
@@ -51,7 +51,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
           *x_grad,
           dev_ctx);
     } else {
-      paddle::operators::cpu_scatter_input_grad_kernel<T, int64_t>(
+      phi::cpu_scatter_input_grad_kernel<T, int64_t>(
           out_grad, axis, index, *x_grad, dev_ctx);
     }
   }
@@ -60,10 +60,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
     value_grad->Resize(index.dims());
     dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
-      paddle::operators::cpu_gather_kernel<T, int32_t>(
+      phi::cpu_gather_kernel<T, int32_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::cpu_gather_kernel<T, int64_t>(
+      phi::cpu_gather_kernel<T, int64_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
     }
   }
diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
index e0cf5f6730c24..1ffb1bce75b8d 100644
--- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
+++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
@@ -14,12 +14,12 @@
 
 #include "paddle/phi/kernels/put_along_axis_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -40,26 +40,26 @@ void PutAlongAxisKernel(const Context& dev_ctx,
   const auto& index_type = index.dtype();
   if (reduce == "add") {
     if (index_type == DataType::INT32) {
-      paddle::operators::cpu_scatter_add_kernel<T, int32_t>(
+      phi::cpu_scatter_add_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::cpu_scatter_add_kernel<T, int64_t>(
+      phi::cpu_scatter_add_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "multiply" || reduce == "mul") {
     if (index_type == DataType::INT32) {
-      paddle::operators::cpu_scatter_mul_kernel<T, int32_t>(
+      phi::cpu_scatter_mul_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::cpu_scatter_mul_kernel<T, int64_t>(
+      phi::cpu_scatter_mul_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "assign") {
     if (index_type == DataType::INT32) {
-      paddle::operators::cpu_scatter_assign_kernel<T, int32_t>(
+      phi::cpu_scatter_assign_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::cpu_scatter_assign_kernel<T, int64_t>(
+      phi::cpu_scatter_assign_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else {
diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
index e8fc15cc171b5..9571d7c1be1c7 100644
--- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
@@ -14,11 +14,11 @@
 
 #include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -46,14 +46,14 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
   const auto& index_type =
       paddle::framework::TransToProtoVarType(index.dtype());
   if (index_type == paddle::framework::proto::VarType::INT32) {
-    paddle::operators::cpu_scatter_add_kernel<T, int32_t>(
+    phi::cpu_scatter_add_kernel<T, int32_t>(
         *x_grad,
         axis,
         index,
         out_grad,
         dev_ctx);  // the gradient of gather is scatter
   } else if (index_type == paddle::framework::proto::VarType::INT64) {
-    paddle::operators::cpu_scatter_add_kernel<T, int64_t>(
+    phi::cpu_scatter_add_kernel<T, int64_t>(
         *x_grad, axis, index, out_grad, dev_ctx);
   }
 }
diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
index cd1ff2e926288..737ba648fa0b2 100644
--- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
+++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
@@ -14,11 +14,11 @@
 
 #include "paddle/phi/kernels/take_along_axis_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -38,11 +38,9 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
 
   const auto& index_type = index.dtype();
   if (index_type == DataType::INT32) {
-    paddle::operators::cpu_gather_kernel<T, int32_t>(
-        x, axis, index, *out, dev_ctx);
+    phi::cpu_gather_kernel<T, int32_t>(x, axis, index, *out, dev_ctx);
   } else if (index_type == DataType::INT64) {
-    paddle::operators::cpu_gather_kernel<T, int64_t>(
-        x, axis, index, *out, dev_ctx);
+    phi::cpu_gather_kernel<T, int64_t>(x, axis, index, *out, dev_ctx);
   }
 }
 
diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/phi/kernels/gather_scatter_kernel.h
similarity index 76%
rename from paddle/fluid/operators/gather_scatter_kernel.h
rename to paddle/phi/kernels/gather_scatter_kernel.h
index 9cf3c3e33009a..3df66379658fa 100644
--- a/paddle/fluid/operators/gather_scatter_kernel.h
+++ b/paddle/phi/kernels/gather_scatter_kernel.h
@@ -12,103 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
 
 #pragma once
 
-namespace paddle {
-namespace operators {
+namespace phi {
 
-#define Instantiate_Template_Function(func)                                  \
-  Instantiate_Template_Function_index_t(                                     \
-      func, int) Instantiate_Template_Function_index_t(func, float)          \
-      Instantiate_Template_Function_index_t(func, double)                    \
-          Instantiate_Template_Function_index_t(func, int64_t)               \
-              Instantiate_Template_Function_index_t(func, platform::float16) \
+#define Instantiate_Template_Function(func)                                    \
+  Instantiate_Template_Function_index_t(                                       \
+      func, int) Instantiate_Template_Function_index_t(func, float)            \
+      Instantiate_Template_Function_index_t(func, double)                      \
+          Instantiate_Template_Function_index_t(func, int64_t)                 \
+              Instantiate_Template_Function_index_t(func, phi::dtype::float16) \
                   Instantiate_Template_Function_index_t(func, unsigned char)
 
-#define Instantiate_Template_Function_index_t(func, tensor_t)            \
-  template void func<tensor_t, int>(phi::DenseTensor input,              \
-                                    int dim,                             \
-                                    const phi::DenseTensor& index,       \
-                                    phi::DenseTensor result,             \
-                                    const platform::DeviceContext& ctx); \
-  template void func<tensor_t, int64_t>(phi::DenseTensor input,          \
-                                        int dim,                         \
-                                        const phi::DenseTensor& index,   \
-                                        phi::DenseTensor result,         \
-                                        const platform::DeviceContext& ctx);
+#define Instantiate_Template_Function_index_t(func, tensor_t)          \
+  template void func<tensor_t, int>(phi::DenseTensor input,            \
+                                    int dim,                           \
+                                    const phi::DenseTensor& index,     \
+                                    phi::DenseTensor result,           \
+                                    const phi::DeviceContext& ctx);    \
+  template void func<tensor_t, int64_t>(phi::DenseTensor input,        \
+                                        int dim,                       \
+                                        const phi::DenseTensor& index, \
+                                        phi::DenseTensor result,       \
+                                        const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void cpu_gather_kernel(phi::DenseTensor self,
                        int dim,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
-                       const platform::DeviceContext& ctx);
+                       const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                int dim,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
-                               const platform::DeviceContext& ctx);
+                               const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void cpu_scatter_add_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx);
+                            const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void cpu_scatter_mul_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx);
+                            const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void cpu_scatter_input_grad_kernel(phi::DenseTensor self,
                                    int dim,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor result,
-                                   const platform::DeviceContext& ctx);
+                                   const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void gpu_gather_kernel(phi::DenseTensor self,
                        int dim,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
-                       const platform::DeviceContext& ctx);
+                       const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void gpu_scatter_assign_kernel(phi::DenseTensor self,
                                int dim,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
-                               const platform::DeviceContext& ctx);
+                               const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void gpu_scatter_add_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx);
+                            const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void gpu_scatter_mul_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx);
+                            const phi::DeviceContext& ctx);
 
 template <typename tensor_t, typename index_t>
 void gpu_scatter_input_grad_kernel(phi::DenseTensor self,
                                    int dim,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor result,
-                                   const platform::DeviceContext& ctx);
-}  // namespace operators
-}  // namespace paddle
+                                   const phi::DeviceContext& ctx);
+
+}  // namespace phi
diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/phi/kernels/gpu/gather_scatter_kernel.cu
similarity index 95%
rename from paddle/fluid/operators/gather_scatter_kernel.cu
rename to paddle/phi/kernels/gpu/gather_scatter_kernel.cu
index 1cb4e4a4e9d78..4c3ac25620da6 100644
--- a/paddle/fluid/operators/gather_scatter_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_scatter_kernel.cu
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
 
 class TensorAssign {
  public:
@@ -111,7 +111,7 @@ struct gpu_gather_scatter_functor {
                   phi::DenseTensor src,
                   const std::string& method_name,
                   const func_t& reduce_op,
-                  const platform::DeviceContext& ctx) {
+                  const phi::DeviceContext& ctx) {
     if (index.numel() == 0) {
       return;
     }
@@ -162,7 +162,7 @@ void gpu_gather_kernel(phi::DenseTensor self,
                        int dim,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
-                       const platform::DeviceContext& ctx) {
+                       const phi::DeviceContext& ctx) {
   gpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/false>()(
@@ -175,7 +175,7 @@ void gpu_scatter_assign_kernel(phi::DenseTensor self,
                                int dim,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
-                               const platform::DeviceContext& ctx) {
+                               const phi::DeviceContext& ctx) {
   gpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(
@@ -187,7 +187,7 @@ void gpu_scatter_add_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx) {
+                            const phi::DeviceContext& ctx) {
   gpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(
@@ -199,7 +199,7 @@ void gpu_scatter_mul_kernel(phi::DenseTensor self,
                             int dim,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
-                            const platform::DeviceContext& ctx) {
+                            const phi::DeviceContext& ctx) {
   gpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(
@@ -232,7 +232,7 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self,
                                    int dim,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
-                                   const platform::DeviceContext& ctx) {
+                                   const phi::DeviceContext& ctx) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -273,5 +273,4 @@ Instantiate_Template_Function(gpu_gather_kernel)
             Instantiate_Template_Function(gpu_scatter_mul_kernel)
                 Instantiate_Template_Function(gpu_scatter_input_grad_kernel)
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
index fcf43f9f42718..f0a164ac8b97b 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
@@ -14,12 +14,12 @@
 
 #include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -41,10 +41,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   if (x_grad) {
     phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
     if (index_type == DataType::INT32) {
-      paddle::operators::gpu_scatter_input_grad_kernel<T, int32_t>(
+      phi::gpu_scatter_input_grad_kernel<T, int32_t>(
           out_grad, axis, index, *x_grad, dev_ctx);
     } else {
-      paddle::operators::gpu_scatter_input_grad_kernel<T, int64_t>(
+      phi::gpu_scatter_input_grad_kernel<T, int64_t>(
           out_grad, axis, index, *x_grad, dev_ctx);
     }
   }
@@ -52,14 +52,14 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
     value_grad->Resize(index.dims());
     dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
-      paddle::operators::gpu_gather_kernel<T, int32_t>(
+      phi::gpu_gather_kernel<T, int32_t>(
           out_grad,
           axis,
           index,
           *value_grad,
           dev_ctx);  // the gradient of scatter is gather
     } else if (index_type == DataType::INT64) {
-      paddle::operators::gpu_gather_kernel<T, int64_t>(
+      phi::gpu_gather_kernel<T, int64_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
     }
   }
diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
index b43d6fafa72a6..f48cd4ea1ea97 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
@@ -14,12 +14,12 @@
 
 #include "paddle/phi/kernels/put_along_axis_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -41,26 +41,26 @@ void PutAlongAxisKernel(const Context& dev_ctx,
   phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
   if (reduce == "add") {
     if (index_type == DataType::INT32) {
-      paddle::operators::gpu_scatter_add_kernel<T, int32_t>(
+      phi::gpu_scatter_add_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::gpu_scatter_add_kernel<T, int64_t>(
+      phi::gpu_scatter_add_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "multiply" || reduce == "mul") {
     if (index_type == DataType::INT32) {
-      paddle::operators::gpu_scatter_mul_kernel<T, int32_t>(
+      phi::gpu_scatter_mul_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::gpu_scatter_mul_kernel<T, int64_t>(
+      phi::gpu_scatter_mul_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "assign") {
     if (index_type == DataType::INT32) {
-      paddle::operators::gpu_scatter_assign_kernel<T, int32_t>(
+      phi::gpu_scatter_assign_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      paddle::operators::gpu_scatter_assign_kernel<T, int64_t>(
+      phi::gpu_scatter_assign_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else {
diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
index 07afc3ba8bb18..61871dd8e57de 100644
--- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
@@ -14,12 +14,12 @@
 
 #include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -46,14 +46,14 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
   const auto& index_type = index.dtype();
 
   if (index_type == DataType::INT32) {
-    paddle::operators::gpu_scatter_add_kernel<T, int32_t>(
+    phi::gpu_scatter_add_kernel<T, int32_t>(
         *x_grad,
         axis,
         index,
         out_grad,
         dev_ctx);  // the gradient of gather is scatter
   } else if (index_type == DataType::INT64) {
-    paddle::operators::gpu_scatter_add_kernel<T, int64_t>(
+    phi::gpu_scatter_add_kernel<T, int64_t>(
         *x_grad, axis, index, out_grad, dev_ctx);
   }
 }
diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
index 28a1c9b657d7a..729b9b5e32de3 100644
--- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
+++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
@@ -14,11 +14,11 @@
 
 #include "paddle/phi/kernels/take_along_axis_kernel.h"
 
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -38,11 +38,9 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
 
   const auto& index_type = index.dtype();
   if (index_type == DataType::INT32) {
-    paddle::operators::gpu_gather_kernel<T, int32_t>(
-        x, axis, index, *out, dev_ctx);
+    phi::gpu_gather_kernel<T, int32_t>(x, axis, index, *out, dev_ctx);
   } else if (index_type == DataType::INT64) {
-    paddle::operators::gpu_gather_kernel<T, int64_t>(
-        x, axis, index, *out, dev_ctx);
+    phi::gpu_gather_kernel<T, int64_t>(x, axis, index, *out, dev_ctx);
   }
 }
 

From 6a17d4692ea3aeac394ee7888abe05090ee85bdb Mon Sep 17 00:00:00 2001
From: huangjiyi <947613776@qq.com>
Date: Mon, 19 Dec 2022 14:10:44 +0800
Subject: [PATCH 2/2] move gather_scatter_kernel to funcs/gather_scatter_functor

---
 paddle/phi/kernels/CMakeLists.txt                  |  3 ++-
 .../phi/kernels/cpu/put_along_axis_grad_kernel.cc  | 10 +++++-----
 paddle/phi/kernels/cpu/put_along_axis_kernel.cc    | 14 +++++++-------
 .../phi/kernels/cpu/take_along_axis_grad_kernel.cc |  6 +++---
 paddle/phi/kernels/cpu/take_along_axis_kernel.cc   |  6 +++---
 paddle/phi/kernels/funcs/CMakeLists.txt            | 12 ++++++++++++
 .../gather_scatter_functor.cc}                     |  4 +++-
 .../gather_scatter_functor.cu}                     |  4 +++-
 .../gather_scatter_functor.h}                      |  2 ++
 .../phi/kernels/gpu/put_along_axis_grad_kernel.cu  | 10 +++++-----
 paddle/phi/kernels/gpu/put_along_axis_kernel.cu    | 14 +++++++-------
 .../phi/kernels/gpu/take_along_axis_grad_kernel.cu |  6 +++---
 paddle/phi/kernels/gpu/take_along_axis_kernel.cu   |  6 +++---
 13 files changed, 58 insertions(+), 39 deletions(-)
 rename paddle/phi/kernels/{cpu/gather_scatter_kernel.cc => funcs/gather_scatter_functor.cc} (98%)
 rename paddle/phi/kernels/{gpu/gather_scatter_kernel.cu => funcs/gather_scatter_functor.cu} (99%)
 rename paddle/phi/kernels/{gather_scatter_kernel.h => funcs/gather_scatter_functor.h} (99%)

diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index a21319dc7f1e3..4d30ecd9bd7c4 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -75,7 +75,8 @@ set(COMMON_KERNEL_DEPS
     fft
     phi_data_layout_transform
     gpc
-    utf8proc)
+    utf8proc
+    gather_scatter_functor)
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
index f891f08cc412c..a4af5f6db57da 100644
--- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
@@ -19,7 +19,7 @@
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
 
@@ -41,7 +41,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   if (x_grad) {
     phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
     if (index_type == DataType::INT32) {
-      phi::cpu_scatter_input_grad_kernel<T, int32_t>(
+      phi::funcs::cpu_scatter_input_grad_kernel<T, int32_t>(
           // Here passing an unused argument out_grad, because it's
           // convenient to instantiate a bunch of template function with the
           // same arguments list.
@@ -51,7 +51,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
           *x_grad,
           dev_ctx);
     } else {
-      phi::cpu_scatter_input_grad_kernel<T, int64_t>(
+      phi::funcs::cpu_scatter_input_grad_kernel<T, int64_t>(
           out_grad, axis, index, *x_grad, dev_ctx);
     }
   }
@@ -60,10 +60,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
     value_grad->Resize(index.dims());
     dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
-      phi::cpu_gather_kernel<T, int32_t>(
+      phi::funcs::cpu_gather_kernel<T, int32_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::cpu_gather_kernel<T, int64_t>(
+      phi::funcs::cpu_gather_kernel<T, int64_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
     }
   }
diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
index 1ffb1bce75b8d..34516a7e4dd93 100644
--- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
+++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
@@ -19,7 +19,7 @@
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
 
@@ -40,26 +40,26 @@ void PutAlongAxisKernel(const Context& dev_ctx,
   const auto& index_type = index.dtype();
   if (reduce == "add") {
     if (index_type == DataType::INT32) {
-      phi::cpu_scatter_add_kernel<T, int32_t>(
+      phi::funcs::cpu_scatter_add_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::cpu_scatter_add_kernel<T, int64_t>(
+      phi::funcs::cpu_scatter_add_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "multiply" || reduce == "mul") {
     if (index_type == DataType::INT32) {
-      phi::cpu_scatter_mul_kernel<T, int32_t>(
+      phi::funcs::cpu_scatter_mul_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::cpu_scatter_mul_kernel<T, int64_t>(
+      phi::funcs::cpu_scatter_mul_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "assign") {
     if (index_type == DataType::INT32) {
-      phi::cpu_scatter_assign_kernel<T, int32_t>(
+      phi::funcs::cpu_scatter_assign_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::cpu_scatter_assign_kernel<T, int64_t>(
+      phi::funcs::cpu_scatter_assign_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else {
diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
index 9571d7c1be1c7..435490d93266e 100644
--- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
@@ -17,8 +17,8 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -46,14 +46,14 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
   const auto& index_type =
       paddle::framework::TransToProtoVarType(index.dtype());
   if (index_type == paddle::framework::proto::VarType::INT32) {
-    phi::cpu_scatter_add_kernel<T, int32_t>(
+    phi::funcs::cpu_scatter_add_kernel<T, int32_t>(
         *x_grad,
         axis,
         index,
         out_grad,
         dev_ctx);  // the gradient of gather is scatter
   } else if (index_type == paddle::framework::proto::VarType::INT64) {
-    phi::cpu_scatter_add_kernel<T, int64_t>(
+    phi::funcs::cpu_scatter_add_kernel<T, int64_t>(
         *x_grad, axis, index, out_grad, dev_ctx);
   }
 }
diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
index 737ba648fa0b2..417b4169d3fdb 100644
--- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
+++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
@@ -18,7 +18,7 @@
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
 
@@ -38,9 +38,9 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
 
   const auto& index_type = index.dtype();
   if (index_type == DataType::INT32) {
-    phi::cpu_gather_kernel<T, int32_t>(x, axis, index, *out, dev_ctx);
+    phi::funcs::cpu_gather_kernel<T, int32_t>(x, axis, index, *out, dev_ctx);
   } else if (index_type == DataType::INT64) {
-    phi::cpu_gather_kernel<T, int64_t>(x, axis, index, *out, dev_ctx);
+    phi::funcs::cpu_gather_kernel<T, int64_t>(x, axis, index, *out, dev_ctx);
   }
 }
 
diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt
index efef150b56acb..e74f8dcc8314d 100644
--- a/paddle/phi/kernels/funcs/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/CMakeLists.txt
@@ -50,3 +50,15 @@ else()
   math_library(selected_rows_functor DEPS selected_rows_utils math_function
                blas mixed_vector)
 endif()
+
+if(WITH_ROCM) # ROCm: build the functor library via hip_library
+  hip_library(
+    gather_scatter_functor
+    SRCS gather_scatter_functor.cc gather_scatter_functor.cu
+    DEPS tensor)
+else() # otherwise: same target, sources and deps via cc_library
+  cc_library(
+    gather_scatter_functor
+    SRCS gather_scatter_functor.cc gather_scatter_functor.cu
+    DEPS tensor)
+endif() # WITH_ROCM
diff --git a/paddle/phi/kernels/cpu/gather_scatter_kernel.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
similarity index 98%
rename from paddle/phi/kernels/cpu/gather_scatter_kernel.cc
rename to paddle/phi/kernels/funcs/gather_scatter_functor.cc
index 824bed9ef2ed5..67af6a3322d9a 100644
--- a/paddle/phi/kernels/cpu/gather_scatter_kernel.cc
+++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
+namespace funcs {
 
 class TensorAssign {
  public:
@@ -219,4 +220,5 @@ Instantiate_Template_Function(cpu_gather_kernel)
             Instantiate_Template_Function(cpu_scatter_mul_kernel)
                 Instantiate_Template_Function(cpu_scatter_input_grad_kernel)
 
+}  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/gather_scatter_kernel.cu b/paddle/phi/kernels/funcs/gather_scatter_functor.cu
similarity index 99%
rename from paddle/phi/kernels/gpu/gather_scatter_kernel.cu
rename to paddle/phi/kernels/funcs/gather_scatter_functor.cu
index 4c3ac25620da6..b53de3beef9aa 100644
--- a/paddle/phi/kernels/gpu/gather_scatter_kernel.cu
+++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cu
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 
 namespace phi {
+namespace funcs {
 
 class TensorAssign {
  public:
@@ -273,4 +274,5 @@ Instantiate_Template_Function(gpu_gather_kernel)
             Instantiate_Template_Function(gpu_scatter_mul_kernel)
                 Instantiate_Template_Function(gpu_scatter_input_grad_kernel)
 
+}  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gather_scatter_kernel.h b/paddle/phi/kernels/funcs/gather_scatter_functor.h
similarity index 99%
rename from paddle/phi/kernels/gather_scatter_kernel.h
rename to paddle/phi/kernels/funcs/gather_scatter_functor.h
index 3df66379658fa..00e8f45d8ffb0 100644
--- a/paddle/phi/kernels/gather_scatter_kernel.h
+++ b/paddle/phi/kernels/funcs/gather_scatter_functor.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #pragma once
 
 namespace phi {
+namespace funcs {
 
 #define Instantiate_Template_Function(func)                                    \
   Instantiate_Template_Function_index_t(                                       \
@@ -110,4 +111,5 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self,
                                    phi::DenseTensor result,
                                    const phi::DeviceContext& ctx);
 
+}  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
index f0a164ac8b97b..afc9984463661 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
@@ -19,7 +19,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
 
@@ -41,10 +41,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   if (x_grad) {
     phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
     if (index_type == DataType::INT32) {
-      phi::gpu_scatter_input_grad_kernel<T, int32_t>(
+      phi::funcs::gpu_scatter_input_grad_kernel<T, int32_t>(
           out_grad, axis, index, *x_grad, dev_ctx);
     } else {
-      phi::gpu_scatter_input_grad_kernel<T, int64_t>(
+      phi::funcs::gpu_scatter_input_grad_kernel<T, int64_t>(
           out_grad, axis, index, *x_grad, dev_ctx);
     }
   }
@@ -52,14 +52,14 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
     value_grad->Resize(index.dims());
     dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
-      phi::gpu_gather_kernel<T, int32_t>(
+      phi::funcs::gpu_gather_kernel<T, int32_t>(
           out_grad,
           axis,
           index,
           *value_grad,
           dev_ctx);  // the gradient of scatter is gather
     } else if (index_type == DataType::INT64) {
-      phi::gpu_gather_kernel<T, int64_t>(
+      phi::funcs::gpu_gather_kernel<T, int64_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
     }
   }
diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
index f48cd4ea1ea97..12127516f0ef3 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
@@ -19,7 +19,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
 
@@ -41,26 +41,26 @@ void PutAlongAxisKernel(const Context& dev_ctx,
   phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
   if (reduce == "add") {
     if (index_type == DataType::INT32) {
-      phi::gpu_scatter_add_kernel<T, int32_t>(
+      phi::funcs::gpu_scatter_add_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::gpu_scatter_add_kernel<T, int64_t>(
+      phi::funcs::gpu_scatter_add_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "multiply" || reduce == "mul") {
     if (index_type == DataType::INT32) {
-      phi::gpu_scatter_mul_kernel<T, int32_t>(
+      phi::funcs::gpu_scatter_mul_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::gpu_scatter_mul_kernel<T, int64_t>(
+      phi::funcs::gpu_scatter_mul_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else if (reduce == "assign") {
     if (index_type == DataType::INT32) {
-      phi::gpu_scatter_assign_kernel<T, int32_t>(
+      phi::funcs::gpu_scatter_assign_kernel<T, int32_t>(
           *out, axis, index, value, dev_ctx);
     } else if (index_type == DataType::INT64) {
-      phi::gpu_scatter_assign_kernel<T, int64_t>(
+      phi::funcs::gpu_scatter_assign_kernel<T, int64_t>(
           *out, axis, index, value, dev_ctx);
     }
   } else {
diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
index 61871dd8e57de..8530e3af1892f 100644
--- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
@@ -18,8 +18,8 @@
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
 
 namespace phi {
 
@@ -46,14 +46,14 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
   const auto& index_type = index.dtype();
 
   if (index_type == DataType::INT32) {
-    phi::gpu_scatter_add_kernel<T, int32_t>(
+    phi::funcs::gpu_scatter_add_kernel<T, int32_t>(
         *x_grad,
         axis,
         index,
         out_grad,
         dev_ctx);  // the gradient of gather is scatter
   } else if (index_type == DataType::INT64) {
-    phi::gpu_scatter_add_kernel<T, int64_t>(
+    phi::funcs::gpu_scatter_add_kernel<T, int64_t>(
         *x_grad, axis, index, out_grad, dev_ctx);
   }
 }
diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
index 729b9b5e32de3..a548ac1a14ddf 100644
--- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
+++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
@@ -18,7 +18,7 @@
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
-#include "paddle/phi/kernels/gather_scatter_kernel.h"
+#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
 
 namespace phi {
 
@@ -38,9 +38,9 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
 
   const auto& index_type = index.dtype();
   if (index_type == DataType::INT32) {
-    phi::gpu_gather_kernel<T, int32_t>(x, axis, index, *out, dev_ctx);
+    phi::funcs::gpu_gather_kernel<T, int32_t>(x, axis, index, *out, dev_ctx);
   } else if (index_type == DataType::INT64) {
-    phi::gpu_gather_kernel<T, int64_t>(x, axis, index, *out, dev_ctx);
+    phi::funcs::gpu_gather_kernel<T, int64_t>(x, axis, index, *out, dev_ctx);
   }
 }