[XPU][NPU] (1) add device_guard. (2) add support for LoDTensorArray of sum op. #44367

Merged: 3 commits, Jul 20, 2022
40 changes: 34 additions & 6 deletions paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -329,14 +329,12 @@ void apply_device_guard(const OperatorBase* op_base,
VLOG(3) << "Switch into CPUPlace by device_guard.";
expected_kernel_key->place_ = platform::CPUPlace();
} else if (op_device.find("gpu") != std::string::npos &&
(platform::is_gpu_place(place) ||
platform::is_npu_place(place))) {
// when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
// will be executed and a warning will be given at the same time.
platform::is_gpu_place(place)) {
// when the Op that does not have GPUKernel is assigned to GPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
if (op_base->SupportGPU()) {
expected_kernel_key->place_ = place;
} else if (op_base->SupportNPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
@@ -345,6 +343,36 @@ void apply_device_guard(const OperatorBase* op_base,
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else if (op_device.find("npu") != std::string::npos &&
platform::is_npu_place(place)) {
// when the Op that does not have NPUKernel is assigned to NPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
if (op_base->SupportNPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
<< "Op(" << op_base->Type()
<< ") has no NPU implementation. It will be assigned to CPUPlace.";
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else if (op_device.find("xpu") != std::string::npos &&
platform::is_xpu_place(place)) {
// when the Op that does not have XPUKernel is assigned to XPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
if (op_base->SupportXPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
<< "Op(" << op_base->Type()
<< ") has no XPU implementation. It will be assigned to CPUPlace.";
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else {
PADDLE_THROW(
platform::errors::Fatal("Unsupported current place %s", op_device));
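For context on how these branches get exercised: `device_guard` in the Python frontend stamps each op created under it with an `op_device` attribute, and `apply_device_guard` above honors that attribute at execution time, falling back to `CPUPlace` with a one-time warning when the op lacks a kernel for the requested device. A minimal static-graph sketch, assuming a Paddle build with XPU support and an attached XPU device (ordinary Paddle APIs, nothing specific to this PR):

```python
import paddle

paddle.enable_static()

main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.full(shape=[2, 3], fill_value=1.0, dtype='float32')
    with paddle.static.device_guard('xpu'):
        # Ops created here carry op_device='xpu'; any of them without an
        # XPU kernel is reassigned to CPUPlace with a warning.
        y = paddle.add(x, x)
    with paddle.static.device_guard('cpu'):
        z = paddle.mean(y)  # explicitly pinned to the host

exe = paddle.static.Executor(paddle.XPUPlace(0))
exe.run(startup_prog)
out, = exe.run(main_prog, fetch_list=[z])
print(out)  # 2.0
```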
88 changes: 85 additions & 3 deletions paddle/fluid/framework/operator.cc
@@ -1274,6 +1274,43 @@ bool OperatorWithKernel::SupportNPU() const {
}
}

bool OperatorWithKernel::SupportXPU() const {
#ifdef PADDLE_WITH_XPU
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
phi::TransToPhiKernelName(type_));
auto has_phi_kernel =
std::any_of(phi_kernels.begin(),
phi_kernels.end(),
[](phi::KernelKeyMap::const_reference kern_pair) {
return kern_pair.first.backend() == phi::Backend::XPU;
});
if (has_phi_kernel) {
return true;
} else {
auto kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
if (kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
return false;
} else {
auto& op_kernels = kernel_iter->second;
return std::any_of(
op_kernels.begin(),
op_kernels.end(),
[this](OpKernelMap::const_reference kern_pair) {
return platform::is_xpu_place(kern_pair.first.place_) &&
paddle::platform::is_xpu_support_op(type_,
kern_pair.first) &&
!paddle::platform::is_in_xpu_black_list(type_);
});
}
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"should not call OperatorWithKernel::SupportXPU() when not compiled with "
"XPU support."));
return false;
#endif
}
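The lookup above is two-tier: the phi kernel registry is consulted first for any kernel registered with the XPU backend, then the legacy fluid registry, where an entry only counts if it is registered for an XPU place, the op is in the XPU supported-op list, and the op is not blacklisted. A schematic of that decision in Python pseudocode; `phi_kernels`, `fluid_kernels`, `xpu_supported`, and `xpu_blacklist` are illustrative stand-ins for the C++ registries, not real Paddle APIs:

```python
def supports_xpu(op_type: str,
                 phi_kernels: dict,    # op_type -> set of phi backends
                 fluid_kernels: dict,  # op_type -> set of registered places
                 xpu_supported: set,   # ops with a usable XPU kernel
                 xpu_blacklist: set) -> bool:
    # Tier 1: any phi kernel registered for the XPU backend?
    if 'XPU' in phi_kernels.get(op_type, set()):
        return True
    # Tier 2: a legacy fluid kernel on an XPU place, subject to the
    # supported-op list and the blacklist.
    return ('XPUPlace' in fluid_kernels.get(op_type, set())
            and op_type in xpu_supported
            and op_type not in xpu_blacklist)
```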

bool OperatorWithKernel::SupportsMKLDNN(
const proto::VarType::Type data_type) const {
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
@@ -1728,15 +1765,35 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
<< "Device index is only supported under pipeline parallelism, "
<< "so it will be ignored.";
}
// when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
// will be executed and a warning will be given at the same time.
// when the Op that does not have GPUKernel is assigned to GPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (SupportGPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
<< ") has no CUDA implementation. It will be assigned to CPUPlace.";
}
} else if (Attr<std::string>("op_device").find("npu") !=
std::string::npos) {
auto device = Attr<std::string>("op_device");
size_t pos = device.find(':');
if (pos != std::string::npos) {
device = device.substr(0, pos);
LOG_FIRST_N(WARNING, 1)
<< "Device index is only supported under pipeline parallelism, "
<< "so it will be ignored.";
}
// when the Op that does not have NPUKernel is assigned to NPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_ASCEND_CL
if (SupportNPU()) {
auto& dev_ctx = ctx.device_context();
@@ -1746,7 +1803,32 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
<< ") has no CUDA implementation. It will be assigned to CPUPlace.";
<< ") has no NPU implementation. It will be assigned to CPUPlace.";
}
} else if (Attr<std::string>("op_device").find("xpu") !=
std::string::npos) {
auto device = Attr<std::string>("op_device");
size_t pos = device.find(':');
if (pos != std::string::npos) {
device = device.substr(0, pos);
LOG_FIRST_N(WARNING, 1)
<< "Device index is only supported under pipeline parallelism, "
<< "so it will be ignored.";
}
// when the Op that does not have XPUKernel is assigned to XPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_XPU
if (SupportXPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
<< ") has no XPU implementation. It will be assigned to CPUPlace.";
}
}
}
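Taken together with the `device_guard` branches above, the flow for a pinned op is: strip any device index from `op_device` (indices are only honored under pipeline parallelism), default the expected place to CPU, and upgrade it to the requested accelerator only when the corresponding `Support*()` check passes under the matching build flag. A compressed pseudocode view, with hypothetical names (`resolve_place`, `supports`) that do not exist in Paddle:

```python
def resolve_place(op_device: str, supports: dict) -> str:
    """Sketch of the op_device handling above (illustrative only)."""
    device = op_device.split(':')[0]  # "xpu:0" -> "xpu"; the index is
                                      # ignored (with a warning) outside
                                      # pipeline parallelism
    if device in ('gpu', 'npu', 'xpu') and supports.get(device, False):
        return device                 # run on the requested accelerator
    # No kernel for the requested device: fall back to CPU with a one-time
    # "has no ... implementation" warning.
    return 'cpu'

# e.g. resolve_place('xpu:0', {'xpu': False}) == 'cpu'
```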
4 changes: 4 additions & 0 deletions paddle/fluid/framework/operator.h
@@ -173,6 +173,7 @@ class OperatorBase {
virtual bool SupportGPU() const { return false; }
virtual bool SupportNPU() const { return false; }
virtual bool SupportMLU() const { return false; }
virtual bool SupportXPU() const { return false; }

const std::string& Type() const { return type_; }

@@ -596,6 +597,9 @@ class OperatorWithKernel : public OperatorBase {
return platform::is_mlu_place(kern_pair.first.place_);
});
}

bool SupportXPU() const override;

bool SupportsMKLDNN(proto::VarType::Type data_type) const;

bool SupportsKernelType(const OpKernelType& kernel_type) const;
11 changes: 6 additions & 5 deletions paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -225,16 +225,17 @@ bool GetCondData(const framework::LoDTensor &cond) {
return cond.data<bool>()[0];
}
// when platform::is_gpu_place(cond.place()) or
// platform::is_npu_place(cond.place()) is true
// platform::is_npu_place(cond.place()) or
// platform::is_xpu_place(cond.place()) is true
std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"This version of PaddlePaddle does NOT support GPU/NPU but got GPU/NPU "
"tensor "
"Cond in WhileOp. Please compile WITH_GPU or WITH_ASCEND_CL option."));
"This version of PaddlePaddle does NOT support GPU/NPU/XPU but got a "
"GPU/NPU/XPU tensor Cond in WhileOp. Please compile with the WITH_GPU, "
"WITH_ASCEND_CL, or WITH_XPU option."));
#endif
return cpu_cond->data<bool>()[0];
}
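`GetCondData` is needed because the `while_op` condition is a one-element bool tensor that may live on the accelerator, while the host decides whether to run another iteration; `TensorCopySync` brings it to the CPU first, and the new `PADDLE_WITH_XPU` guard extends that to XPU builds. At the Python level this is the ordinary `while_loop`, sketched here assuming an XPU device:

```python
import paddle

paddle.enable_static()

def cond(i, ten):
    # i < ten yields a 1-element bool tensor; on an XPU run it lives on the
    # device, and GetCondData syncs it to the CPU before each iteration.
    return i < ten

def body(i, ten):
    return i + 1, ten

main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    i = paddle.zeros(shape=[1], dtype='int64')
    ten = paddle.full(shape=[1], fill_value=10, dtype='int64')
    i, ten = paddle.static.nn.while_loop(cond, body, [i, ten])

exe = paddle.static.Executor(paddle.XPUPlace(0))
print(exe.run(main_prog, fetch_list=[i]))  # [array([10])]
```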
116 changes: 86 additions & 30 deletions paddle/fluid/operators/sum_op_xpu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
#include <vector>

#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"

namespace paddle {
@@ -28,38 +29,93 @@ class SumXPUKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X");
auto out_var = context.OutputVar("Out");
auto *out = context.Output<LoDTensor>("Out");
bool in_place = out_var == in_vars[0];
int N = in_vars.size();
PADDLE_ENFORCE_EQ(
out_var->IsType<framework::LoDTensor>(),
true,
platform::errors::InvalidArgument("XPU only surpport LodTensor"));
if (!in_place) {
out->mutable_data<T>(context.GetPlace());
}
auto &dev_ctx = context.template device_context<DeviceContext>();
std::vector<const XPUType *> ptrs;
for (int i = 0; i < N; ++i) {
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(),
true,
platform::errors::InvalidArgument("XPU only surpport LodTensor"));
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
continue;

if (out_var->IsType<framework::LoDTensor>()) {
auto *out = context.Output<LoDTensor>("Out");
bool in_place = out_var == in_vars[0];
int N = in_vars.size();

if (!in_place) {
out->mutable_data<T>(context.GetPlace());
}
auto &dev_ctx = context.template device_context<DeviceContext>();
std::vector<const XPUType *> ptrs;
for (int i = 0; i < N; ++i) {
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(),
true,
platform::errors::InvalidArgument("XPU only support LodTensor"));
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
continue;
}
ptrs.push_back(reinterpret_cast<const XPUType *>(in_t.data<T>()));
}
int r = xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType *>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
} else if (out_var->IsType<framework::LoDTensorArray>()) {
bool in_place = out_var == in_vars[0];
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();

for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
true,
platform::errors::InvalidArgument(
"Only support all inputs are TensorArray, "
"but inputs[%d] is not TensorArray.",
i));
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();

for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
framework::TensorCopy(in_array[i],
in_array[i].place(),
context.device_context(),
&out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE_EQ(
out_array[i].lod(),
in_array[i].lod(),
platform::errors::InvalidArgument(
"The LoD of inputs[%d] and"
" outputs[%d] must be the same, but they differ.",
i,
i));

std::vector<const XPUType *> ptrs;
ptrs.push_back(
reinterpret_cast<const XPUType *>(in_array[i].data<T>()));
ptrs.push_back(
reinterpret_cast<const XPUType *>(out_array[i].data<T>()));

auto &dev_ctx = context.template device_context<DeviceContext>();
// int sum(Context* ctx, const std::vector<const T*>& x_list, T*
// y, int len);
int r =
xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType *>(out_array[i].data<T>()),
out_array[i].numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
}
}
}
}
ptrs.push_back(reinterpret_cast<const XPUType *>(in_t.data<T>()));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(Out) to be LoDTensor or "
"LoDTensorArray, but got unsupported type: %s.",
framework::ToTypeName(out_var->Type())));
}
int r = xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType *>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
platform::errors::External(
"XPU sum kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
}
};

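The new branch makes `sum` aggregate `LoDTensorArray` inputs slot by slot: an input slot is copied through when the output slot is empty, otherwise the pair is reduced with `xpu::sum`. Tensor arrays are summed mainly by internal callers (e.g. gradient accumulation for `while_loop`), but `paddle.add_n`, which lowers to the same `sum` op, shows the kernel's plain-tensor path; a sketch assuming an XPU build:

```python
import paddle

paddle.set_device('xpu')  # assumes Paddle was compiled with XPU support
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
y = paddle.to_tensor([[10.0, 20.0], [30.0, 40.0]])
# add_n lowers to the sum op; on an XPU place it dispatches to the XPU kernel.
print(paddle.add_n([x, y]))  # [[11., 22.], [33., 44.]]
```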
4 changes: 2 additions & 2 deletions python/paddle/fluid/framework.py
@@ -7078,9 +7078,9 @@ def device_guard(device=None):
device, index = device.split(':')
if device == 'cpu':
raise ValueError("Should not set device id for cpu.")
if device not in ['cpu', 'gpu', 'npu', '', None]:
if device not in ['cpu', 'gpu', 'npu', 'xpu', '', None]:
raise ValueError(
"The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
"The Attr(device) should be 'cpu' 'npu' 'xpu' or 'gpu', and it can also be empty string or None "
"when there is no need to specify device. But received %s" % device)
if index:
device = ":".join([device, index])
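The frontend change is validation only: `'xpu'` joins the accepted device strings, a device index is still rejected for `'cpu'`, and anything outside the list raises. A quick illustration of what is now accepted and rejected, using the same `device_guard` API as in the diff:

```python
import paddle

paddle.enable_static()

with paddle.static.device_guard('xpu'):    # newly accepted by this PR
    pass
with paddle.static.device_guard('xpu:0'):  # index kept; only honored under
    pass                                   # pipeline parallelism
try:
    with paddle.static.device_guard('cpu:0'):
        pass
except ValueError as e:
    print(e)  # Should not set device id for cpu.
try:
    with paddle.static.device_guard('tpu'):
        pass
except ValueError as e:
    print(e)  # The Attr(device) should be 'cpu', 'gpu', 'npu' or 'xpu', ...
```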