Add FP16 ops for the Kunlun ResNet50 model #44672

Merged: 2 commits merged on Jul 29, 2022
121 changes: 71 additions & 50 deletions paddle/fluid/operators/fused/resnet_unit_op_xpu.cc
@@ -23,6 +23,8 @@ using Tensor = framework::Tensor;

template <typename T>
class ResNetUnitXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;

public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
@@ -63,9 +65,12 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
std::string act_type = ctx.Attr<std::string>("act_type");
auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

std::vector<const T *> x_list = {input_x->data<T>()};
std::vector<const T *> w_list = {filter_x->data<T>()};
std::vector<T *> conv_y_list = {conv_out_x->mutable_data<T>(place)};
std::vector<const XPUType *> x_list = {
reinterpret_cast<const XPUType *>(input_x->data<T>())};
std::vector<const XPUType *> w_list = {
reinterpret_cast<const XPUType *>(filter_x->data<T>())};
std::vector<XPUType *> conv_y_list = {
reinterpret_cast<XPUType *>(conv_out_x->mutable_data<T>(place))};

std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(input_x->dims())};
@@ -107,9 +112,10 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
Tensor *running_mean_z = ctx.Output<Tensor>("RunningMeanZ");
Tensor *running_var_z = ctx.Output<Tensor>("RunningVarZ");

x_list.push_back(input_z->data<T>());
w_list.push_back(filter_z->data<T>());
conv_y_list.push_back(conv_out_z->mutable_data<T>(place));
x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
conv_y_list.push_back(
reinterpret_cast<XPUType *>(conv_out_z->mutable_data<T>(place)));

x_shape_list.push_back(phi::vectorize<int>(input_z->dims()));

@@ -133,17 +139,17 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
if (fuse_add) {
const Tensor *input_z = ctx.Input<Tensor>("Z");
auto input_z_shape = phi::vectorize<int>(input_z->dims());
x_list.push_back(input_z->data<T>());
x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
x_shape_list.push_back(input_z_shape);
x_maxlist.push_back(nullptr);
}
}
int r = xpu::resnet_unit_fusion<T, T, T, int16_t>(
int r = xpu::resnet_unit_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
conv_y_list,
output->mutable_data<T>(place),
reinterpret_cast<XPUType *>(output->mutable_data<T>(place)),
x_shape_list,
filter_x_shape[0],
ksize_list,
@@ -172,6 +178,8 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {

template <typename T>
class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;

public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
@@ -208,11 +216,16 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {

auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

std::vector<const T *> x_list = {x->data<T>()};
std::vector<const T *> w_list = {filter_x->data<T>()};
std::vector<const T *> conv_y_list = {conv_out_x->data<T>()};
std::vector<T *> dx_list = {x_grad->mutable_data<T>(place)};
std::vector<T *> dw_list = {filter_x_grad->mutable_data<T>(place)};
std::vector<const XPUType *> x_list = {
reinterpret_cast<const XPUType *>(x->data<T>())};
std::vector<const XPUType *> w_list = {
reinterpret_cast<const XPUType *>(filter_x->data<T>())};
std::vector<const XPUType *> conv_y_list = {
reinterpret_cast<const XPUType *>(conv_out_x->data<T>())};
std::vector<XPUType *> dx_list = {
reinterpret_cast<XPUType *>(x_grad->mutable_data<T>(place))};
std::vector<XPUType *> dw_list = {
reinterpret_cast<XPUType *>(filter_x_grad->mutable_data<T>(place))};

std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(x->dims())};
@@ -262,11 +275,14 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
Tensor *scale_z_grad =
ctx.Output<Tensor>(framework::GradVarName("ScaleZ"));
Tensor *bias_z_grad = ctx.Output<Tensor>(framework::GradVarName("BiasZ"));
x_list.push_back(z->data<T>());
w_list.push_back(filter_z->data<T>());
conv_y_list.push_back(conv_out_z->data<T>());
dx_list.push_back(z_grad->mutable_data<T>(place));
dw_list.push_back(filter_z_grad->mutable_data<T>(place));
x_list.push_back(reinterpret_cast<const XPUType *>(z->data<T>()));
w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
conv_y_list.push_back(
reinterpret_cast<const XPUType *>(conv_out_z->data<T>()));
dx_list.push_back(
reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
dw_list.push_back(
reinterpret_cast<XPUType *>(filter_z_grad->mutable_data<T>(place)));
x_shape_list.push_back(phi::vectorize<int>(z->dims()));

auto filter_z_shape = phi::vectorize<int>(filter_z->dims());
@@ -288,38 +304,39 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
} else {
if (fuse_add) {
auto z_grad = ctx.Output<Tensor>(framework::GradVarName("Z"));
dx_list.push_back(z_grad->mutable_data<T>(place));
dx_list.push_back(
reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
}
}

int r =
xpu::resnet_unit_grad_fusion<T, T, T, int16_t>(dev_ctx.x_context(),
x_list,
w_list,
y_grad->data<T>(),
output->data<T>(),
conv_y_list,
dx_list,
dw_list,
x_shape_list,
filter_x_shape[0],
ksize_list,
stride_list,
paddings,
dilations,
group,
x_maxlist,
w_maxlist,
scale_list,
batch_mean_list,
batch_invstd_list,
dscale_list,
dbias_list,
xpu::Activation_t::RELU,
eps,
is_nchw,
has_shortcut,
fuse_add);
int r = xpu::resnet_unit_grad_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
reinterpret_cast<const XPUType *>(y_grad->data<T>()),
reinterpret_cast<const XPUType *>(output->data<T>()),
conv_y_list,
dx_list,
dw_list,
x_shape_list,
filter_x_shape[0],
ksize_list,
stride_list,
paddings,
dilations,
group,
x_maxlist,
w_maxlist,
scale_list,
batch_mean_list,
batch_invstd_list,
dscale_list,
dbias_list,
xpu::Activation_t::RELU,
eps,
is_nchw,
has_shortcut,
fuse_add);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion");
}
};
@@ -329,5 +346,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit,
ops::ResNetUnitXPUKernel<plat::float16>,
ops::ResNetUnitXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit_grad,
ops::ResNetUnitGradXPUKernel<plat::float16>,
ops::ResNetUnitGradXPUKernel<float>);
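In short, the kernels stay templated on the framework type T, `XPUTypeTrait<T>::Type` maps T to the type the XDNN library expects (for `plat::float16` that is the device half type), every tensor pointer is `reinterpret_cast` to that device type before the fused call, and the FP16 kernel is registered alongside the FP32 one. A minimal, self-contained sketch of the same trait-and-cast pattern, using toy types and a toy device function (hypothetical names, not the real Paddle/XDNN APIs):

```cpp
#include <cstdint>
#include <vector>

// Toy stand-ins for plat::float16 and XDNN's device half type (hypothetical
// names, not the Paddle/XDNN definitions). Both carry identical 16-bit
// payloads, which is what makes the reinterpret_cast at the boundary safe.
struct FwFloat16 { uint16_t bits; };
struct DevHalf   { uint16_t bits; };

// Same idea as XPUTypeTrait<T>: identity for float, device type for fp16.
template <typename T> struct TypeTraitSketch { using Type = T; };
template <> struct TypeTraitSketch<FwFloat16> { using Type = DevHalf; };

// Toy device entry point standing in for a fused xpu::* call.
template <typename XPUT>
int device_op(const XPUT* x, XPUT* y, int64_t n) {
  (void)x; (void)y; (void)n;
  return 0;  // XDNN-style status code
}

// Kernel templated on the framework type T, mirroring ResNetUnitXPUKernel.
template <typename T>
int RunKernel(const T* in, T* out, int64_t n) {
  using XPUType = typename TypeTraitSketch<T>::Type;
  // Only the nominal pointer type changes at the framework/device boundary.
  return device_op<XPUType>(reinterpret_cast<const XPUType*>(in),
                            reinterpret_cast<XPUType*>(out), n);
}

int main() {
  std::vector<float> a(8, 1.0f), b(8);
  int r = RunKernel<float>(a.data(), b.data(), 8);        // fp32 path
  std::vector<FwFloat16> c(8), d(8);
  r |= RunKernel<FwFloat16>(c.data(), d.data(), 8);       // fp16 path, same body
  return r;
}
```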
29 changes: 18 additions & 11 deletions paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc
@@ -22,6 +22,8 @@ namespace operators {

template <typename T>
class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;

public:
void Compute(const framework::ExecutionContext& ctx) const override {
bool multi_precision = ctx.Attr<bool>("multi_precision");
@@ -35,14 +37,14 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
auto master_param = ctx.MultiInput<framework::LoDTensor>("MasterParam");
auto master_param_out =
ctx.MultiOutput<framework::LoDTensor>("MasterParamOut");
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T epsilon = ctx.Attr<float>("epsilon");
T rescale_grad = ctx.Attr<float>("rescale_grad");
float mu = ctx.Attr<float>("mu");
float lars_coeff = ctx.Attr<float>("lars_coeff");
float epsilon = ctx.Attr<float>("epsilon");
float rescale_grad = ctx.Attr<float>("rescale_grad");

std::vector<T*> param_list;
std::vector<T*> grad_list;
std::vector<T*> param_out_list;
std::vector<XPUType*> param_list;
std::vector<XPUType*> grad_list;
std::vector<XPUType*> param_out_list;
std::vector<float*> velocity_list;
std::vector<float*> velocity_out_list;
std::vector<float*> lrs;
@@ -52,9 +54,12 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
std::vector<float*> master_param_out_list;
int op_num = param.size();
for (int i = 0; i < op_num; ++i) {
param_list.push_back(const_cast<T*>(param[i]->data<T>()));
grad_list.push_back(const_cast<T*>(grad[i]->data<T>()));
param_out_list.push_back(param_out[i]->mutable_data<T>(ctx.GetPlace()));
param_list.push_back(
reinterpret_cast<XPUType*>(const_cast<T*>((param[i]->data<T>()))));
grad_list.push_back(
reinterpret_cast<XPUType*>(const_cast<T*>(grad[i]->data<T>())));
param_out_list.push_back(reinterpret_cast<XPUType*>(
param_out[i]->mutable_data<T>(ctx.GetPlace())));
velocity_list.push_back(const_cast<float*>(velocity[i]->data<float>()));
velocity_out_list.push_back(
velocity_out[i]->mutable_data<float>(ctx.GetPlace()));
@@ -111,5 +116,7 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(lars_momentum,
ops::LarsMomentumOpXPUKernel<paddle::platform::float16>,
ops::LarsMomentumOpXPUKernel<float>);
#endif
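The optimizer change follows the same boundary rule: parameter and gradient buffers go through `XPUType` casts (FP16 under mixed precision), while velocities, learning rates, master parameters, and the scalar hyperparameters stay in `float`, so nothing is rounded through half precision on the host. For reference, a plain-float sketch of the LARS update that the fused call applies per tensor, assuming the standard formulation (trust-ratio learning rate from the parameter/gradient norms, then a heavy-ball step); `lars_step` and its `weight_decay` argument are hypothetical names for illustration, and the real call batches many tensors and does the FP16/master-weight bookkeeping on device:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Plain-float sketch of one LARS step for a single tensor (hypothetical
// helper, not a Paddle API). Under fp16 training the fused kernel reads and
// writes fp16 param/grad while velocity and master weights remain float.
void lars_step(std::vector<float>& param, const std::vector<float>& grad,
               std::vector<float>& velocity, float lr, float mu,
               float lars_coeff, float weight_decay, float epsilon,
               float rescale_grad) {
  double p_sq = 0.0, g_sq = 0.0;
  for (std::size_t i = 0; i < param.size(); ++i) {
    p_sq += static_cast<double>(param[i]) * param[i];
    const float g = grad[i] * rescale_grad;
    g_sq += static_cast<double>(g) * g;
  }
  const float p_norm = static_cast<float>(std::sqrt(p_sq));
  const float g_norm = static_cast<float>(std::sqrt(g_sq));
  // Trust-ratio ("local") learning rate.
  const float local_lr =
      lr * lars_coeff * p_norm / (g_norm + weight_decay * p_norm + epsilon);
  for (std::size_t i = 0; i < param.size(); ++i) {
    const float g = grad[i] * rescale_grad + weight_decay * param[i];
    velocity[i] = mu * velocity[i] + local_lr * g;  // heavy-ball accumulation
    param[i] -= velocity[i];
  }
}
```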
19 changes: 13 additions & 6 deletions paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -235,7 +235,9 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::FP16, XPUPlace())})},
{"generate_proposals_v2",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"grad_add",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"greater_equal",
XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
pOpKernelType(vartype::INT32, XPUPlace()),
@@ -258,9 +260,8 @@ XPUOpMap& get_kl2_ops() {
{"label_smooth",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"lars_momentum",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -380,9 +381,12 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::BOOL, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit",
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
Expand Down Expand Up @@ -496,6 +500,9 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"update_loss_scaling",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"unsqueeze2_grad",
XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace()),
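Registering the kernels is only half the wiring: an op also has to appear in this KL2 op list with every data type it supports, otherwise the FP16 kernel will not be selected on an XPU2 device even though it is registered. A toy model of that lookup, with a bare dtype standing in for the full `pOpKernelType` key (hypothetical names, not the Paddle API):

```cpp
#include <map>
#include <set>
#include <string>

// Toy model of the KL2 op list: op name -> set of data types it may run with
// on an XPU2 device. The real XPUKernelSet stores full kernel-type keys.
enum class DType { FP32, FP16 };

std::map<std::string, std::set<DType>> MakeKL2OpList() {
  return {
      {"resnet_unit", {DType::FP32, DType::FP16}},
      {"resnet_unit_grad", {DType::FP32, DType::FP16}},
      {"lars_momentum", {DType::FP32, DType::FP16}},
      {"grad_add", {DType::FP32, DType::FP16}},
  };
}

// Dispatch-time check in the spirit of what the framework does with the list:
// if the (op, dtype) pair is absent, the op is not run in fp16 on the XPU.
bool SupportedOnXPU2(const std::string& op, DType dtype) {
  static const auto kOps = MakeKL2OpList();
  const auto it = kOps.find(op);
  return it != kOps.end() && it->second.count(dtype) > 0;
}
```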
15 changes: 11 additions & 4 deletions paddle/phi/kernels/xpu/elementwise_add_kernel.cc
@@ -24,18 +24,25 @@ void GradAddXPUKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;

dev_ctx.template Alloc<T>(out);
auto x_shape = phi::vectorize<int>(x.dims());
auto y_shape = phi::vectorize<int>(y.dims());
int r = xpu::broadcast_add(dev_ctx.x_context(),
x.data<T>(),
y.data<T>(),
out->data<T>(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<const XPUType*>(y.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x_shape,
y_shape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
}

} // namespace phi

PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {}
PD_REGISTER_KERNEL(grad_add,
XPU,
ALL_LAYOUT,
phi::GradAddXPUKernel,
phi::dtype::float16,
float) {}
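`grad_add` is a phi kernel, so FP16 support amounts to one extra dtype in `PD_REGISTER_KERNEL` plus the usual `XPUType` casts around `xpu::broadcast_add`. The shape vectors give the call NumPy-style broadcasting semantics; a small CPU-only sketch of what that means (`broadcast_add` here is an illustration, not the XDNN implementation):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// CPU-only sketch of NumPy-style broadcasting for an elementwise add:
// shapes are right-aligned and size-1 dimensions repeat.
std::vector<float> broadcast_add(const std::vector<float>& x, std::vector<int> xs,
                                 const std::vector<float>& y, std::vector<int> ys) {
  while (xs.size() < ys.size()) xs.insert(xs.begin(), 1);  // left-pad ranks
  while (ys.size() < xs.size()) ys.insert(ys.begin(), 1);
  std::vector<int> os(xs.size());
  int64_t total = 1;
  for (size_t d = 0; d < xs.size(); ++d) {
    assert(xs[d] == ys[d] || xs[d] == 1 || ys[d] == 1);
    os[d] = std::max(xs[d], ys[d]);
    total *= os[d];
  }
  std::vector<float> out(static_cast<size_t>(total));
  for (int64_t i = 0; i < total; ++i) {
    int64_t rem = i, xi = 0, yi = 0;
    for (size_t d = 0; d < os.size(); ++d) {
      int64_t stride = 1;
      for (size_t k = d + 1; k < os.size(); ++k) stride *= os[k];
      const int idx = static_cast<int>(rem / stride);  // index along dim d
      rem %= stride;
      xi = xi * xs[d] + (xs[d] == 1 ? 0 : idx);  // fold into row-major offsets
      yi = yi * ys[d] + (ys[d] == 1 ? 0 : idx);
    }
    out[static_cast<size_t>(i)] =
        x[static_cast<size_t>(xi)] + y[static_cast<size_t>(yi)];
  }
  return out;
}

// Example: shape (2, 3) + shape (3) -> (2, 3), the typical bias-add pattern.
// auto z = broadcast_add({1, 2, 3, 4, 5, 6}, {2, 3}, {10, 20, 30}, {3});
```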
36 changes: 21 additions & 15 deletions paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc
@@ -26,6 +26,7 @@ void LogSoftmaxGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
int axis,
DenseTensor* x_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
const int rank = out.dims().size();
axis = funcs::CanonicalAxis(axis, rank);

@@ -40,24 +41,29 @@
PADDLE_ENFORCE_NE(
tmp2_ptr, nullptr, phi::errors::External("not enough memory in xpu"));

int r =
xpu::exp(dev_ctx.x_context(), out.data<T>(), tmp_ptr, out_grad.numel());
int r = xpu::exp<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out.data<T>()),
reinterpret_cast<XPUType*>(tmp_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp");
r = xpu::reciprocal(
dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel());
r = xpu::reciprocal<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp_ptr),
reinterpret_cast<XPUType*>(tmp2_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal");
r = xpu::mul(dev_ctx.x_context(),
tmp2_ptr,
out_grad.data<T>(),
tmp2_ptr,
out_grad.numel());
r = xpu::mul<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp2_ptr),
reinterpret_cast<const XPUType*>(out_grad.data<T>()),
reinterpret_cast<XPUType*>(tmp2_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
r = xpu::softmax_grad(dev_ctx.x_context(),
tmp_ptr,
tmp2_ptr,
x_grad->data<T>(),
out_shape,
axis);
r = xpu::softmax_grad<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp_ptr),
reinterpret_cast<const XPUType*>(tmp2_ptr),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
out_shape,
axis);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad");
}
}
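The FP16 enablement here is again just `XPUType` casts, but the four-call chain is worth spelling out. Assuming `xpu::softmax_grad(y, dy, ...)` implements the usual softmax backward $dx = y \odot (dy - \sum_j y_j\, dy_j)$ along the axis, the sequence exp → reciprocal → mul → softmax_grad reproduces the log-softmax gradient:

$$
\begin{aligned}
s &= \mathrm{softmax}(x) = e^{\mathrm{out}} && \text{(tmp = exp(out))}\\
g &= \frac{\partial L}{\partial \mathrm{out}} \oslash s && \text{(tmp2 = reciprocal(tmp), then mul by out\_grad)}\\
\frac{\partial L}{\partial x_i} &= s_i\Big(g_i - \sum_j s_j\, g_j\Big)
 = \frac{\partial L}{\partial \mathrm{out}_i} - s_i \sum_j \frac{\partial L}{\partial \mathrm{out}_j},
\end{aligned}
$$

which matches the analytic gradient of $\mathrm{out} = \log\mathrm{softmax}(x)$. Dividing by $s$ and multiplying it back is what requires the two scratch buffers; a direct $dy - s\sum_j dy_j$ formulation would avoid them, but that would be a separate change.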