[OpenCL] Support layout type: kImageFolder (#7143)
zhaoyang-star authored Oct 11, 2021
1 parent cebfb91 commit bbfdd62
Showing 10 changed files with 546 additions and 59 deletions.
10 changes: 9 additions & 1 deletion lite/api/tools/opt_base.cc
@@ -108,10 +108,14 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
   } else if (target_repr == "opencl") {
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
+    valid_places_.emplace_back(
+        Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder)});
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
+    valid_places_.emplace_back(
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageFolder)});
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
     valid_places_.emplace_back(
@@ -145,10 +149,14 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
   } else if (target_repr == "x86_opencl") {
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
+    valid_places_.emplace_back(
+        Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder)});
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
+    valid_places_.emplace_back(
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageFolder)});
     valid_places_.emplace_back(
         Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
     valid_places_.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)});
@@ -508,7 +516,7 @@ void OptBase::PrintAllSupportedOpsInMdformat() {
       "英特尔FPGA",
       "华为昇腾NPU",
       "联发科APU",
-      "瑞芯微NPU ",
+      "瑞芯微NPU",
      "华为麒麟NPU",
      "颖脉NNA",
      "晶晨NPU"};
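Context for this file: opt's valid-places list is ordered by preference, and the kernel-pick pass favors kernels whose (target, precision, layout) appears earlier in it; inserting the kImageFolder entries directly after their kImageDefault counterparts makes folder-layout kernels (such as the fc kernel later in this commit) selectable without outranking the default image path. A minimal sketch of that first-match idea, using a hypothetical Place stand-in rather than Paddle-Lite's real types:

#include <cstddef>
#include <vector>

enum class Layout { kImageDefault, kImageFolder, kNCHW };

struct Place {  // hypothetical stand-in for paddle::lite_api::Place
  Layout layout;
};

// Returns the index of the first valid place matching the kernel's layout;
// lower is better, so kImageFolder kernels rank just behind kImageDefault
// ones under the list built above.
int Score(const std::vector<Place>& valid_places, Layout kernel_layout) {
  for (size_t i = 0; i < valid_places.size(); ++i) {
    if (valid_places[i].layout == kernel_layout) return static_cast<int>(i);
  }
  return -1;  // kernel not runnable under the current valid places
}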
103 changes: 103 additions & 0 deletions lite/backends/opencl/cl_kernel/image/layout_kernel.cl
@@ -311,3 +311,106 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
     out[index + size_ch * 3] = convert_uchar_sat(in.w);
   }
 }
+
+////////////////////////////////////////////////////////
+// image2d_default -> image2d_folder
+////////////////////////////////////////////////////////
+__kernel void image2d_default_to_image2d_folder(__read_only image2d_t input,
+                                                __write_only image2d_t output,
+                                                __private const int in_img_w,
+                                                __private const int in_img_h) {
+  const int pos_x = get_global_id(0);
+  const int pos_y = get_global_id(1);
+
+  CL_DTYPE4 in =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x, pos_y));
+
+  CL_DTYPE4 in0 = 0.f;
+  CL_DTYPE4 in1 = 0.f;
+  CL_DTYPE4 in2 = 0.f;
+  CL_DTYPE4 in3 = 0.f;
+
+  in0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4, pos_y));
+  if (pos_x * 4 + 1 < in_img_w) {
+    in1 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4 + 1, pos_y));
+  }
+  if (pos_x * 4 + 2 < in_img_w) {
+    in2 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4 + 2, pos_y));
+  }
+  if (pos_x * 4 + 3 < in_img_w) {
+    in3 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4 + 3, pos_y));
+  }
+
+  CL_DTYPE4 out = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x, pos_y), out);
+}
+
+////////////////////////////////////////////////////////
+// image2d_folder -> image2d_default
+////////////////////////////////////////////////////////
+__kernel void image2d_folder_to_image2d_default(__read_only image2d_t input,
+                                                __write_only image2d_t output,
+                                                __private const int out_img_w,
+                                                __private const int out_img_h) {
+  const int pos_x = get_global_id(0);
+  const int pos_y = get_global_id(1);
+
+  CL_DTYPE4 in =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x, pos_y));
+
+  CL_DTYPE4 out0 = 0.f;
+  CL_DTYPE4 out1 = 0.f;
+  CL_DTYPE4 out2 = 0.f;
+  CL_DTYPE4 out3 = 0.f;
+  out0.x = in.x;
+  out1.x = in.y;
+  out2.x = in.z;
+  out3.x = in.w;
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4, pos_y), out0);
+  if (pos_x * 4 + 1 < out_img_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4 + 1, pos_y), out1);
+  }
+  if (pos_x * 4 + 2 < out_img_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4 + 2, pos_y), out2);
+  }
+  if (pos_x * 4 + 3 < out_img_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4 + 3, pos_y), out3);
+  }
+}
+
+////////////////////////////////////////////////////////
+// image2d_folder -> buffer
+////////////////////////////////////////////////////////
+__kernel void image2d_folder_to_buffer(__read_only image2d_t input,
+                                       __global float* output,
+                                       __private const int out_h,
+                                       __private const int out_w) {
+  const int pos_x = get_global_id(0);
+  const int pos_y = get_global_id(1);
+
+  CL_DTYPE4 in =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x, pos_y));
+
+  float4 out0 = 0.f;
+  float4 out1 = 0.f;
+  float4 out2 = 0.f;
+  float4 out3 = 0.f;
+  float4 out = convert_float4(in);
+
+  int outpos_base = out_w * pos_y + pos_x * 4;
+  int length = out_w * out_h;
+  output[outpos_base] = out.x;
+  if (outpos_base + 1 < length) {
+    output[outpos_base + 1] = out.y;
+  }
+  if (outpos_base + 2 < length) {
+    output[outpos_base + 2] = out.z;
+  }
+  if (outpos_base + 3 < length) {
+    output[outpos_base + 3] = out.w;
+  }
+}
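What these kernels implement: under kImageDefault a rank-2 tensor of shape (H, W) is treated as (1, 1, H, W) and maps to a W x H image in which only the .x channel of each texel carries data; kImageFolder instead packs four consecutive width-elements into one RGBA texel, shrinking the image to ceil(W/4) x H and using all four channels. The C++ sketch below is an illustrative host-side reference of the default-to-folder repacking that image2d_default_to_image2d_folder performs on the GPU (it assumes the rank-2 behaviour of CLImageConverterFolder; the helper itself is not part of Paddle-Lite):

#include <vector>

// Host-side reference of the default->folder repacking for a rank-2 tensor
// of shape (h, w). Both images are stored as RGBA, 4 floats per texel.
std::vector<float> DefaultToFolder(const std::vector<float>& def_img,
                                   int w, int h) {
  const int folder_w = (w + 3) / 4;
  std::vector<float> folder_img(folder_w * h * 4, 0.f);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < folder_w; ++x) {
      for (int k = 0; k < 4; ++k) {
        const int src_x = x * 4 + k;  // neighbouring default-layout texel
        if (src_x < w) {
          // .x channel of default texel (src_x, y) -> channel k of (x, y)
          folder_img[(y * folder_w + x) * 4 + k] =
              def_img[(y * w + src_x) * 4 + 0];
        }
      }
    }
  }
  return folder_img;
}

image2d_folder_to_image2d_default is the exact inverse, and image2d_folder_to_buffer flattens the packed image into a buffer with the same bounds guard for the tail when W is not a multiple of 4.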
24 changes: 22 additions & 2 deletions lite/core/optimizer/mir/type_layout_cast_pass.cc
@@ -89,14 +89,33 @@ void TypeLayoutTransformPass::ComplementInputs(
   };
   auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
   if (is_host(in_arg_type->target()) &&
-      in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
+      (in_arg_type->layout() == DATALAYOUT(kImageDefault) ||
+       in_arg_type->layout() == DATALAYOUT(kImageFolder))) {
     return;
   }
 
   if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
     VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
             << " for kernel " << inst.op()->DebugString() << " "
             << *in->AsArg().type << " -> " << *decl_arg_type;
+
+    // Special case for opencl:
+    // Data layout of kImageDefault is the same as kImageFolder when the size
+    // of tensor's dims is greater than 2.
+    auto a = (*in->AsArg().type).layout();
+    auto b = (*decl_arg_type).layout();
+    const auto& tensor =
+        inst.op()->scope()->FindVar(in->AsArg().name)->Get<Tensor>();
+    const bool skip_flag = (((a == DATALAYOUT(kImageDefault)) &&
+                             (b == DATALAYOUT(kImageFolder))) ||
+                            ((a == DATALAYOUT(kImageFolder)) &&
+                             (b == DATALAYOUT(kImageDefault)))) &&
+                           (tensor.dims().size() > 2);
+    if (skip_flag) {
+      VLOG(3) << "skip this case";
+      return;
+    }
+
     AddLayoutInst(*in->AsArg().type,
                   *decl_arg_type,
                   in,
@@ -185,7 +204,8 @@ void TypeLayoutTransformPass::AddLayoutInst(
         (TargetCompatibleTo(*in_arg_ty, from) &&
          /* skip precision check: PrecisionCompatibleTo(*in_arg_ty, from) &&*/
          DeviceCompatibleTo(*in_arg_ty, from) &&
-         out_arg_ty->layout() == to.layout())) {
+         DataLayoutCompatible(*in_arg_ty, from) &&
+         (out_arg_ty->layout() == to.layout()))) {
       is_found = true;
     } else if (TypeCompatible(*in_arg_ty, from) &&
                out_arg_ty->layout() == to.layout()) {
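The new early-return encodes a useful invariant: the folder converter only lays tensors out differently from the default converter at rank 2 or below, so when the tensor has more than two dims a kImageDefault/kImageFolder mismatch needs no cast at all. A self-contained restatement of the skip predicate (the helper name is hypothetical; the real check is inlined in ComplementInputs above):

#include <cstddef>

enum class DataLayout { kImageDefault, kImageFolder, kNCHW };

// For rank > 2 the folder converter produces exactly the default layout, so
// no layout-cast instruction is needed between the two image layouts.
bool CanSkipLayoutCast(DataLayout a, DataLayout b, size_t tensor_rank) {
  const bool image_pair =
      (a == DataLayout::kImageDefault && b == DataLayout::kImageFolder) ||
      (a == DataLayout::kImageFolder && b == DataLayout::kImageDefault);
  return image_pair && tensor_rank > 2;
}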
54 changes: 45 additions & 9 deletions lite/core/profile/precision_profiler.h
@@ -334,12 +334,6 @@ class PrecisionProfiler {
       switch (layout_type) {
         case DATALAYOUT(kImageDefault): {
           auto in_dims = in->dims();
-          // special case
-          if ((in_dims.size() == 2) &&
-              (op_name == "fc" || op_name == "softmax")) {
-            in_dims = DDim(std::vector<DDim::value_type>(
-                {in->dims()[0], in->dims()[1], 1, 1}));
-          }
           paddle::lite::CLImageConverterDefault default_convertor;
           auto image_shape = default_convertor.InitImageDimInfoWith(in_dims);
           size_t im_w = image_shape[0];
@@ -364,9 +358,6 @@ class PrecisionProfiler {
                                       cl_image2d_row_pitch,
                                       cl_image2d_slice_pitch,
                                       IoDirection::DtoH);
-          // TODO(zhaoyang-star): Tensor shape padding mode will change from
-          // high-dim padding to low-dim padding to fit image2d.
-          // ImageConverter will be changed.
           default_convertor.ImageToNCHW(
               in_data_v, real_out_v.data(), image_shape, in_dims);
           CHECK(real_out_v.size() == in->numel());
@@ -386,6 +377,51 @@ class PrecisionProfiler {
           }
           return;
         }
+        case DATALAYOUT(kImageFolder): {
+          auto in_dims = in->dims();
+          paddle::lite::CLImageConverterFolder folder_convertor;
+          auto image_shape = folder_convertor.InitImageDimInfoWith(in_dims);
+          size_t im_w = image_shape[0];
+          size_t im_h = image_shape[1];
+          VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
+                  << im_h;
+          auto* in_data_v =
+              use_fp16
+                  ? static_cast<void*>(
+                        calloc(im_w * im_h * 4, sizeof(uint16_t)))
+                  : static_cast<void*>(calloc(im_w * im_h * 4, sizeof(float)));
+
+          std::vector<float> real_out_v(in->numel());
+          const size_t cl_image2d_row_pitch{0};
+          const size_t cl_image2d_slice_pitch{0};
+          TargetWrapperCL::ImgcpySync(in_data_v,
+                                      use_fp16
+                                          ? in->data<uint16_t, cl::Image2D>()
+                                          : in->data<float, cl::Image2D>(),
+                                      im_w,
+                                      im_h,
+                                      cl_image2d_row_pitch,
+                                      cl_image2d_slice_pitch,
+                                      IoDirection::DtoH);
+          folder_convertor.ImageToNCHW(
+              in_data_v, real_out_v.data(), image_shape, in_dims);
+          CHECK(real_out_v.size() == in->numel());
+          *mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
+          *std_dev = compute_standard_deviation<float>(
+              real_out_v.data(), in->numel(), true, *mean);
+          *ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
+                                                            real_out_v.size());
+          std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
+          real_out_t->Resize(in_dims);
+          float* real_out_data = real_out_t->mutable_data<float>();
+          memcpy(real_out_data,
+                 real_out_v.data(),
+                 real_out_v.size() * sizeof(float));
+          if (write_result_to_file) {
+            write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+          }
+          return;
+        }
         case DATALAYOUT(kNCHW): {
           std::vector<float> in_data_v(in->numel(), 0);
           TargetWrapperCL::MemcpySync(in_data_v.data(),
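The added kImageFolder branch is a near-copy of the kImageDefault one, swapping in CLImageConverterFolder so the image extents, and hence the staging buffer for the device-to-host copy, match the folder packing. A sketch of the sizing arithmetic for a rank-2 tensor (m, n), assuming the folder converter maps it to a ceil(n/4) x m image (the helper below is illustrative, not Paddle-Lite API):

#include <cstddef>

// Host staging allocation for reading back a folder-layout image, mirroring
// calloc(im_w * im_h * 4, sizeof(...)) in the new branch: every texel holds
// four values, so the element count is im_w * im_h * 4.
size_t FolderStagingElemCount(size_t m, size_t n) {
  const size_t im_w = (n + 3) / 4;  // four consecutive n-values per texel
  const size_t im_h = m;
  return im_w * im_h * 4;  // e.g. m=3, n=10 -> 3*3*4 = 36 slots for 30 values
}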
9 changes: 6 additions & 3 deletions lite/core/type_system.h
@@ -195,15 +195,18 @@ static bool DataLayoutCompatibleTo(const Type& a, const Type& b) {
   return a.IsVoid() ||                 //
          (a.layout() == b.layout() ||  //
           ((b.layout() == DATALAYOUT(kAny)) &&
-           (a.layout() != DATALAYOUT(kImageDefault))));
+           (a.layout() != DATALAYOUT(kImageDefault) &&
+            a.layout() != DATALAYOUT(kImageFolder))));
 }
 static bool DataLayoutCompatible(const Type& a, const Type& b) {
   return a.IsVoid() || b.IsVoid() ||   //
          (a.layout() == b.layout() ||  //
           ((b.layout() == DATALAYOUT(kAny)) &&
-           (a.layout() != DATALAYOUT(kImageDefault))) ||
+           (a.layout() != DATALAYOUT(kImageDefault) &&
+            a.layout() != DATALAYOUT(kImageFolder))) ||
           ((a.layout() == DATALAYOUT(kAny)) &&
-           (b.layout() != DATALAYOUT(kImageDefault))));
+           (b.layout() != DATALAYOUT(kImageDefault) &&
+            b.layout() != DATALAYOUT(kImageFolder))));
 }
 
 static bool PrecisionCompatibleTo(const Type& a, const Type& b) {
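After this change a kAny layout on one side no longer absorbs an image layout on the other: both kImageDefault and kImageFolder are excluded, so entering or leaving either image layout always goes through an explicit layout cast. A self-contained illustration of the resulting behaviour (a local restatement of the layout clause for two non-void types, not the real Type machinery):

#include <cassert>

enum class Layout { kAny, kNCHW, kImageDefault, kImageFolder };

// Layout clause of DataLayoutCompatible, restated for plain enums.
bool LayoutCompatible(Layout a, Layout b) {
  const bool a_is_image =
      a == Layout::kImageDefault || a == Layout::kImageFolder;
  const bool b_is_image =
      b == Layout::kImageDefault || b == Layout::kImageFolder;
  return a == b || (b == Layout::kAny && !a_is_image) ||
         (a == Layout::kAny && !b_is_image);
}

int main() {
  assert(LayoutCompatible(Layout::kNCHW, Layout::kAny));          // unchanged
  assert(!LayoutCompatible(Layout::kImageFolder, Layout::kAny));  // now casts
  assert(!LayoutCompatible(Layout::kAny, Layout::kImageFolder));  // symmetric
  assert(LayoutCompatible(Layout::kImageFolder, Layout::kImageFolder));
  return 0;
}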
8 changes: 4 additions & 4 deletions lite/kernels/opencl/fc_image_compute.cc
@@ -22,7 +22,7 @@ namespace opencl {
 
 class FcImageCompute : public KernelLite<TARGET(kOpenCL),
                                          PRECISION(kFP16),
-                                         DATALAYOUT(kImageDefault)> {
+                                         DATALAYOUT(kImageFolder)> {
  public:
   void PrepareForRun() override {
     auto& param = this->Param<operators::FcParam>();
@@ -317,18 +317,18 @@ class FcImageCompute : public KernelLite<TARGET(kOpenCL),
 REGISTER_LITE_KERNEL(fc,
                      kOpenCL,
                      kFP16,
-                     kImageDefault,
+                     kImageFolder,
                      paddle::lite::kernels::opencl::FcImageCompute,
                      image2d)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
-                                      DATALAYOUT(kImageDefault))})
+                                      DATALAYOUT(kImageFolder))})
     .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))})
     .BindInput("W", {LiteType::GetTensorTy(TARGET(kHost))})
     .BindInput("Alpha", {LiteType::GetTensorTy(TARGET(kHost))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kOpenCL),
                                        PRECISION(kFP16),
-                                       DATALAYOUT(kImageDefault))})
+                                       DATALAYOUT(kImageFolder))})
     .Finalize();
4 changes: 2 additions & 2 deletions lite/kernels/opencl/fc_image_compute_test.cc
@@ -89,7 +89,7 @@ void test(const lite_api::CLPrecisionType p,
             << " m=" << m << " n=" << n << " k=" << k;
 
   auto kernels = KernelRegistry::Global().Create(
-      "fc", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
+      "fc", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder));
   ASSERT_FALSE(kernels.empty());
   auto kernel = std::move(kernels.front());
 
@@ -238,4 +238,4 @@ TEST(fc, compute_basic) {
 }  // namespace lite
 }  // namespace paddle
 
-USE_LITE_KERNEL(fc, kOpenCL, kFP16, kImageDefault, image2d);
+USE_LITE_KERNEL(fc, kOpenCL, kFP16, kImageFolder, image2d);