[XPU][NPU] (1) add device_guard. (2) add support for LoDTensorArray of sum op. #44367

Merged: 3 commits, Jul 20, 2022
40 changes: 34 additions & 6 deletions paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -329,14 +329,12 @@ void apply_device_guard(const OperatorBase* op_base,
VLOG(3) << "Switch into CPUPlace by device_guard.";
expected_kernel_key->place_ = platform::CPUPlace();
} else if (op_device.find("gpu") != std::string::npos &&
(platform::is_gpu_place(place) ||
platform::is_npu_place(place))) {
// when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
// will be executed and a warning will be given at the same time.
platform::is_gpu_place(place)) {
// when the Op that does not have GPUKernel is assigned to GPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
if (op_base->SupportGPU()) {
expected_kernel_key->place_ = place;
} else if (op_base->SupportNPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
@@ -345,6 +343,36 @@ void apply_device_guard(const OperatorBase* op_base,
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else if (op_device.find("npu") != std::string::npos &&
platform::is_npu_place(place)) {
// when the Op that does not have NPUKernel is assigned to NPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
if (op_base->SupportNPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
<< "Op(" << op_base->Type()
<< ") has no NPU implementation. It will be assigned to CPUPlace.";
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else if (op_device.find("xpu") != std::string::npos &&
platform::is_xpu_place(place)) {
// when the Op that does not have XPUKernel is assigned to XPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
if (op_base->SupportXPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
<< "Op(" << op_base->Type()
<< ") has no XPU implementation. It will be assigned to CPUPlace.";
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else {
PADDLE_THROW(
platform::errors::Fatal("Unsupported current place %s", op_device));
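For context on how these branches get exercised: `device_guard` in the Python frontend stamps each op created under it with an `op_device` attribute, and `apply_device_guard` above honors that attribute at execution time, falling back to `CPUPlace` with a one-time warning when the op lacks a kernel for the requested device. A minimal static-graph sketch, assuming a Paddle build with XPU support and an attached XPU device (ordinary Paddle APIs, nothing specific to this PR):

```python
import paddle

paddle.enable_static()

main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.full(shape=[2, 3], fill_value=1.0, dtype='float32')
    with paddle.static.device_guard('xpu'):
        # Ops created here carry op_device='xpu'; any of them without an
        # XPU kernel is reassigned to CPUPlace with a warning.
        y = paddle.add(x, x)
    with paddle.static.device_guard('cpu'):
        z = paddle.mean(y)  # explicitly pinned to the host

exe = paddle.static.Executor(paddle.XPUPlace(0))
exe.run(startup_prog)
out, = exe.run(main_prog, fetch_list=[z])
print(out)  # 2.0
```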
88 changes: 85 additions & 3 deletions paddle/fluid/framework/operator.cc
@@ -1274,6 +1274,43 @@ bool OperatorWithKernel::SupportNPU() const {
}
}

bool OperatorWithKernel::SupportXPU() const {
#ifdef PADDLE_WITH_XPU
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
phi::TransToPhiKernelName(type_));
auto has_phi_kernel =
std::any_of(phi_kernels.begin(),
phi_kernels.end(),
[](phi::KernelKeyMap::const_reference kern_pair) {
return kern_pair.first.backend() == phi::Backend::XPU;
});
if (has_phi_kernel) {
return true;
} else {
auto kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
if (kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
return false;
} else {
auto& op_kernels = kernel_iter->second;
return std::any_of(
op_kernels.begin(),
op_kernels.end(),
[this](OpKernelMap::const_reference kern_pair) {
return platform::is_xpu_place(kern_pair.first.place_) &&
paddle::platform::is_xpu_support_op(type_,
kern_pair.first) &&
!paddle::platform::is_in_xpu_black_list(type_);
});
}
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"should not call OperatorWithKernel::SupportXPU() when not compiled with "
"XPU support."));
return false;
#endif
}
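The lookup above is two-tier: the phi kernel registry is consulted first for any kernel registered with the XPU backend, then the legacy fluid registry, where an entry only counts if it is registered for an XPU place, the op is in the XPU supported-op list, and the op is not blacklisted. A schematic of that decision in Python pseudocode; `phi_kernels`, `fluid_kernels`, `xpu_supported`, and `xpu_blacklist` are illustrative stand-ins for the C++ registries, not real Paddle APIs:

```python
def supports_xpu(op_type: str,
                 phi_kernels: dict,    # op_type -> set of phi backends
                 fluid_kernels: dict,  # op_type -> set of registered places
                 xpu_supported: set,   # ops with a usable XPU kernel
                 xpu_blacklist: set) -> bool:
    # Tier 1: any phi kernel registered for the XPU backend?
    if 'XPU' in phi_kernels.get(op_type, set()):
        return True
    # Tier 2: a legacy fluid kernel on an XPU place, subject to the
    # supported-op list and the blacklist.
    return ('XPUPlace' in fluid_kernels.get(op_type, set())
            and op_type in xpu_supported
            and op_type not in xpu_blacklist)
```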

bool OperatorWithKernel::SupportsMKLDNN(
const proto::VarType::Type data_type) const {
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
@@ -1728,15 +1765,35 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
<< "Device index is only supported under pipeline parallelism, "
<< "so it will be ignored.";
}
// when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
// will be executed and a warning will be given at the same time.
// when the Op that does not have GPUKernel is assigned to GPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (SupportGPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
<< ") has no CUDA implementation. It will be assigned to CPUPlace.";
}
} else if (Attr<std::string>("op_device").find("npu") !=
std::string::npos) {
auto device = Attr<std::string>("op_device");
size_t pos = device.find(':');
if (pos != std::string::npos) {
device = device.substr(0, pos);
LOG_FIRST_N(WARNING, 1)
<< "Device index is only supported under pipeline parallelism, "
<< "so it will be ignored.";
}
// when the Op that does not have NPUKernel is assigned to NPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_ASCEND_CL
if (SupportNPU()) {
auto& dev_ctx = ctx.device_context();
@@ -1746,7 +1803,32 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
<< ") has no CUDA implementation. It will be assigned to CPUPlace.";
<< ") has no NPU implementation. It will be assigned to CPUPlace.";
}
} else if (Attr<std::string>("op_device").find("xpu") !=
std::string::npos) {
auto device = Attr<std::string>("op_device");
size_t pos = device.find(':');
if (pos != std::string::npos) {
device = device.substr(0, pos);
LOG_FIRST_N(WARNING, 1)
<< "Device index is only supported under pipeline parallelism, "
<< "so it will be ignored.";
}
// when the Op that does not have XPUKernel is assigned to XPU, the
// CPUKernel will be executed and a warning will be given at the same
// time.
expected_kernel_key.place_ = platform::CPUPlace();
#ifdef PADDLE_WITH_XPU
if (SupportXPU()) {
auto& dev_ctx = ctx.device_context();
expected_kernel_key.place_ = dev_ctx.GetPlace();
}
#endif
if (platform::is_cpu_place(expected_kernel_key.place_)) {
LOG_FIRST_N(WARNING, 1)
<< "Op(" << type_
<< ") has no XPU implementation. It will be assigned to CPUPlace.";
}
}
}
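Taken together with the `device_guard` branches above, the flow for a pinned op is: strip any device index from `op_device` (indices are only honored under pipeline parallelism), default the expected place to CPU, and upgrade it to the requested accelerator only when the corresponding `Support*()` check passes under the matching build flag. A compressed pseudocode view, with hypothetical names (`resolve_place`, `supports`) that do not exist in Paddle:

```python
def resolve_place(op_device: str, supports: dict) -> str:
    """Sketch of the op_device handling above (illustrative only)."""
    device = op_device.split(':')[0]  # "xpu:0" -> "xpu"; the index is
                                      # ignored (with a warning) outside
                                      # pipeline parallelism
    if device in ('gpu', 'npu', 'xpu') and supports.get(device, False):
        return device                 # run on the requested accelerator
    # No kernel for the requested device: fall back to CPU with a one-time
    # "has no ... implementation" warning.
    return 'cpu'

# e.g. resolve_place('xpu:0', {'xpu': False}) == 'cpu'
```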
4 changes: 4 additions & 0 deletions paddle/fluid/framework/operator.h
@@ -173,6 +173,7 @@ class OperatorBase {
virtual bool SupportGPU() const { return false; }
virtual bool SupportNPU() const { return false; }
virtual bool SupportMLU() const { return false; }
virtual bool SupportXPU() const { return false; }

const std::string& Type() const { return type_; }

@@ -596,6 +597,9 @@ class OperatorWithKernel : public OperatorBase {
return platform::is_mlu_place(kern_pair.first.place_);
});
}

bool SupportXPU() const override;

bool SupportsMKLDNN(proto::VarType::Type data_type) const;

bool SupportsKernelType(const OpKernelType& kernel_type) const;
11 changes: 6 additions & 5 deletions paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -225,16 +225,17 @@ bool GetCondData(const framework::LoDTensor &cond) {
return cond.data<bool>()[0];
}
// when platform::is_gpu_place(cond.place()) or
// platform::is_npu_place(cond.place()) is true
// platform::is_npu_place(cond.place()) or
// platform::is_xpu_place(cond.place()) is true
std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"This version of PaddlePaddle does NOT support GPU/NPU but got GPU/NPU "
"tensor "
"Cond in WhileOp. Please compile WITH_GPU or WITH_ASCEND_CL option."));
"This version of PaddlePaddle does NOT support GPU/NPU/XPU but got a "
"GPU/NPU/XPU tensor Cond in WhileOp. Please compile with the WITH_GPU, "
"WITH_ASCEND_CL, or WITH_XPU option."));
#endif
return cpu_cond->data<bool>()[0];
}
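`GetCondData` is needed because the `while_op` condition is a one-element bool tensor that may live on the accelerator, while the host decides whether to run another iteration; `TensorCopySync` brings it to the CPU first, and the new `PADDLE_WITH_XPU` guard extends that to XPU builds. At the Python level this is the ordinary `while_loop`, sketched here assuming an XPU device:

```python
import paddle

paddle.enable_static()

def cond(i, ten):
    # i < ten yields a 1-element bool tensor; on an XPU run it lives on the
    # device, and GetCondData syncs it to the CPU before each iteration.
    return i < ten

def body(i, ten):
    return i + 1, ten

main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    i = paddle.zeros(shape=[1], dtype='int64')
    ten = paddle.full(shape=[1], fill_value=10, dtype='int64')
    i, ten = paddle.static.nn.while_loop(cond, body, [i, ten])

exe = paddle.static.Executor(paddle.XPUPlace(0))
print(exe.run(main_prog, fetch_list=[i]))  # [array([10])]
```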
116 changes: 86 additions & 30 deletions paddle/fluid/operators/sum_op_xpu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
#include <vector>

#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"

namespace paddle {
@@ -28,38 +29,93 @@ class SumXPUKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X");
auto out_var = context.OutputVar("Out");
auto *out = context.Output<LoDTensor>("Out");
bool in_place = out_var == in_vars[0];
int N = in_vars.size();
PADDLE_ENFORCE_EQ(
out_var->IsType<framework::LoDTensor>(),
true,
platform::errors::InvalidArgument("XPU only surpport LodTensor"));
if (!in_place) {
out->mutable_data<T>(context.GetPlace());
}
auto &dev_ctx = context.template device_context<DeviceContext>();
std::vector<const XPUType *> ptrs;
for (int i = 0; i < N; ++i) {
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(),
true,
platform::errors::InvalidArgument("XPU only surpport LodTensor"));
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
continue;

if (out_var->IsType<framework::LoDTensor>()) {
auto *out = context.Output<LoDTensor>("Out");
bool in_place = out_var == in_vars[0];
int N = in_vars.size();

if (!in_place) {
out->mutable_data<T>(context.GetPlace());
}
auto &dev_ctx = context.template device_context<DeviceContext>();
std::vector<const XPUType *> ptrs;
for (int i = 0; i < N; ++i) {
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(),
true,
platform::errors::InvalidArgument("XPU only support LodTensor"));
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
continue;
}
ptrs.push_back(reinterpret_cast<const XPUType *>(in_t.data<T>()));
}
int r = xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType *>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
} else if (out_var->IsType<framework::LoDTensorArray>()) {
bool in_place = out_var == in_vars[0];
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();

for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
true,
platform::errors::InvalidArgument(
"Only support all inputs are TensorArray, "
"but inputs[%d] is not TensorArray.",
i));
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();

for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
framework::TensorCopy(in_array[i],
in_array[i].place(),
context.device_context(),
&out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE_EQ(
out_array[i].lod(),
in_array[i].lod(),
platform::errors::InvalidArgument(
"The LoD of inputs[%d] and"
" outputs[%d] must be the same, but they differ.",
i,
i));

std::vector<const XPUType *> ptrs;
ptrs.push_back(
reinterpret_cast<const XPUType *>(in_array[i].data<T>()));
ptrs.push_back(
reinterpret_cast<const XPUType *>(out_array[i].data<T>()));

auto &dev_ctx = context.template device_context<DeviceContext>();
// int sum(Context* ctx, const std::vector<const T*>& x_list, T*
// y, int len);
int r =
xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType *>(out_array[i].data<T>()),
out_array[i].numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
}
}
}
}
ptrs.push_back(reinterpret_cast<const XPUType *>(in_t.data<T>()));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(Out) to be LoDTensor or "
"LoDTensorArray, but got unsupported type: %s.",
framework::ToTypeName(out_var->Type())));
}
int r = xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType *>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
platform::errors::External(
"XPU sum kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
}
};

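The new branch makes `sum` aggregate `LoDTensorArray` inputs slot by slot: an input slot is copied through when the output slot is empty, otherwise the pair is reduced with `xpu::sum`. Tensor arrays are summed mainly by internal callers (e.g. gradient accumulation for `while_loop`), but `paddle.add_n`, which lowers to the same `sum` op, shows the kernel's plain-tensor path; a sketch assuming an XPU build:

```python
import paddle

paddle.set_device('xpu')  # assumes Paddle was compiled with XPU support
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
y = paddle.to_tensor([[10.0, 20.0], [30.0, 40.0]])
# add_n lowers to the sum op; on an XPU place it dispatches to the XPU kernel.
print(paddle.add_n([x, y]))  # [[11., 22.], [33., 44.]]
```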
4 changes: 2 additions & 2 deletions python/paddle/fluid/framework.py
@@ -7078,9 +7078,9 @@ def device_guard(device=None):
device, index = device.split(':')
if device == 'cpu':
raise ValueError("Should not set device id for cpu.")
if device not in ['cpu', 'gpu', 'npu', '', None]:
if device not in ['cpu', 'gpu', 'npu', 'xpu', '', None]:
raise ValueError(
"The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
"The Attr(device) should be 'cpu' 'npu' 'xpu' or 'gpu', and it can also be empty string or None "
"when there is no need to specify device. But received %s" % device)
if index:
device = ":".join([device, index])
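The frontend change is validation only: `'xpu'` joins the accepted device strings, a device index is still rejected for `'cpu'`, and anything outside the list raises. A quick illustration of what is now accepted and rejected, using the same `device_guard` API as in the diff:

```python
import paddle

paddle.enable_static()

with paddle.static.device_guard('xpu'):    # newly accepted by this PR
    pass
with paddle.static.device_guard('xpu:0'):  # index kept; only honored under
    pass                                   # pipeline parallelism
try:
    with paddle.static.device_guard('cpu:0'):
        pass
except ValueError as e:
    print(e)  # Should not set device id for cpu.
try:
    with paddle.static.device_guard('tpu'):
        pass
except ValueError as e:
    print(e)  # The Attr(device) should be 'cpu', 'gpu', 'npu' or 'xpu', ...
```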