[OpenCL] Optimize FC & Softmax (#6560)
zhaoyang-star authored Jul 29, 2021
1 parent 7660463 commit 3dbaebd
Showing 16 changed files with 1,031 additions and 80 deletions.
8 changes: 4 additions & 4 deletions docs/demo_guides/opencl.md
@@ -100,7 +100,7 @@ rm ./lite/api/paddle_use_ops.h
build_opencl
```

Note: to debug a cl kernel, assuming the build script above has already been run (so the cmake files exist), just edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`, save it, and run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the repository root. This command automatically picks up the modification; then switch to the build directory and run `make publish_inference` or the executable name of the unit test you want to build. The cl kernel contents are packed into the build artifacts (the .so or the corresponding unit-test executable) automatically during compilation.
Note: to debug a cl kernel, assuming the build script above has already been run (so the cmake files exist), just edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`, save it, and run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the repository root. This command automatically updates `opencl_kernels_source.cc`; then switch to the build directory (e.g. `build.lite.android.armv8.gcc`) and run `make publish_inference` or the target of the unit test to build (e.g. `make test_fc_image_opencl`). The cl kernel contents are packed into the build artifacts (the .so or the corresponding unit-test executable) automatically during compilation.
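
Condensed into commands (the build directory and test target are the examples named above):
```
# 1. Edit the kernel under ./lite/backends/opencl/cl_kernel/ and save it.
# 2. Regenerate the embedded kernel sources from the repository root:
python ./lite/tools/cmake_tools/gen_opencl_code.py \
    ./lite/backends/opencl/cl_kernel \
    ./lite/backends/opencl/opencl_kernels_source.cc
# 3. Rebuild from the existing build directory, e.g.:
cd build.lite.android.armv8.gcc
make publish_inference   # or a single test, e.g. make test_fc_image_opencl
```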

### 1.3 编译产物说明

@@ -185,7 +185,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/opencl/; \
/data/local/tmp/opencl/mobilenetv1_light_api \
/data/local/tmp/opencl/mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb \
1,3,224,224 \
100 10 0 1 1 0"
# repeats=100, warmup=10
# power_mode=0 binds to big cores, thread_num=1
# accelerate_opencl=1 enables the opencl kernel cache & tuning; it only takes effect when the model runs on the opencl backend
@@ -263,9 +263,9 @@ Analysis on the macOS x86 platform:
Analysis on the Windows x86 platform:
```
# Enable performance profiling; prints per-op timing and summary information
.\lite\tools\build_windows.bat with_opencl with_extra with_profile
# Enable precision profiling; prints the mean and standard deviation of each op's output data
.\lite\tools\build_windows.bat with_opencl with_extra with_precision_profile
```
See [Debug Tools](../user_guides/debug) for a detailed explanation of the output.
2 changes: 1 addition & 1 deletion lite/backends/opencl/cl_context.h
@@ -40,7 +40,7 @@ class CLContext {
p.second.reset();
}
CLRuntime::Global()->program_map().clear();
LOG(INFO) << "release cl::Program, cl::Kernel finished.";
VLOG(4) << "release cl::Program, cl::Kernel finished.";
}

cl::CommandQueue &GetCommandQueue();
16 changes: 11 additions & 5 deletions lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
@@ -2468,15 +2468,19 @@ __kernel void conv2d_1x1_fc(__read_only image2d_t input,
#ifdef ELT_FUSE
__read_only image2d_t second_input_image,
#endif // ELT_FUSE
int batch,
int in_c_blks,
int out_c_blks) {
int out_n = get_global_id(2);
int out_c = get_global_id(0);
int2 tid = (int2)(get_local_id(0), get_local_id(1));
CL_DTYPE4 s = (CL_DTYPE4)(0.0f);
if (out_n >= batch) return;

if (out_c < out_c_blks) {
for (int c = tid.y; c < in_c_blks; c += 4) {
CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, 0));
CL_DTYPE4 v =
READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, out_n));
CL_DTYPE16 w = weights[c * out_c_blks + out_c];
CL_DTYPE4 partial = v.x * w.s0123;
partial += v.y * w.s4567;
@@ -2496,20 +2500,22 @@ __kernel void conv2d_1x1_fc(__read_only image2d_t input,
s += temp[tid.x][1];
s += temp[tid.x][2];
s += temp[tid.x][3];
int2 output_pos0 = (int2)(out_c, 0);
int2 output_pos0 = (int2)(out_c, out_n);

#ifdef BIASE_CH
CL_DTYPE4 output0 =
s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, output_pos0);
s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, (int2)(out_c, 0));
#else
CL_DTYPE4 output0 = s;
#endif

CL_DTYPE4 alpha0;
#ifdef PRELU_CH
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
alpha0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
#elif defined(PRELU_ELE)
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
alpha0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
#elif defined(PRELU_ALL)
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
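The hunk above generalizes the FC-as-1x1-conv kernel from a single batch row to batched input: each batch element `out_n` now maps to its own image row, while per-channel bias and prelu data, which have no batch dimension, keep reading row 0. A minimal sketch of the implied addressing (hypothetical helper names, not code from this commit):
```
// Sketch only: image2d coordinates implied by the updated conv2d_1x1_fc
// kernel for an FC tensor of shape [batch, channels], 4 channels per pixel.
#include <utility>

// Input/output pixel for batch row n and channel block c_blk.
// Before this commit the row was hardcoded to 0 (batch == 1 only).
std::pair<int, int> FcPixel(int n, int c_blk) { return {c_blk, n}; }

// Bias and per-channel prelu alpha are batch-independent: always row 0.
std::pair<int, int> PerChannelPixel(int out_c_blk) { return {out_c_blk, 0}; }
```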
76 changes: 76 additions & 0 deletions lite/backends/opencl/cl_kernel/image/softmax_kernel.cl
@@ -188,3 +188,79 @@ __kernel void softmax_channel(__read_only image2d_t input,
WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output, (int2)(cur_out_width_pos, bh_idx), input_data);
}

__kernel void softmax_1x1(__read_only image2d_t input,
__write_only image2d_t output,
__private const float4 mask,
__private const int c_blks) {
const int c_blk_idx = get_global_id(0);
const int b_idx = get_global_id(1);
const int tid = get_local_id(0);

// Compute Max
float4 maxx4 = read_imagef(input, SAMPLER, (int2)(0, b_idx));
for (int s = tid; s < c_blks; s += 32) { // as workgroup size is 32
float4 mask_a = s == c_blks - 1 ? mask : (float4)(1.0f);
float4 src = read_imagef(input, SAMPLER, (int2)(s, b_idx));
src = src * mask_a;
maxx4 = max(maxx4, src);
}
float maximum = max(maxx4.x, maxx4.y);
maximum = max(maximum, maxx4.z);
maximum = max(maximum, maxx4.w);

// We need to find the final max value among each workgroup.
// Note workgroup size is 32, so we need 8 float4 data to store 32 maximum.
__local float4 tmp[8];
__local float* tmpx1 = (__local float*)tmp;
tmpx1[tid] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
maxx4 = max(tmp[0], tmp[1]);
maxx4 = max(maxx4, tmp[2]);
maxx4 = max(maxx4, tmp[3]);
maxx4 = max(maxx4, tmp[4]);
maxx4 = max(maxx4, tmp[5]);
maxx4 = max(maxx4, tmp[6]);
maxx4 = max(maxx4, tmp[7]);
maximum = max(maxx4.x, maxx4.y);
maximum = max(maximum, maxx4.z);
maximum = max(maximum, maxx4.w);
tmpx1[0] = maximum;
}
barrier(CLK_LOCAL_MEM_FENCE);
maximum = tmpx1[0];

// Compute Exp Sum
float sum = 0.0f;
for (int s = tid; s < c_blks; s += 32) {
float4 mask_temp = s == c_blks - 1 ? mask : (float4)(1.0f);
float4 src =
read_imagef(input, SAMPLER, (int2)(s, b_idx)) - (float4)(maximum);
sum += dot(mask_temp, exp(src));
}
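// Reuse the same __local buffer for the sum reduction. The barrier below
// guarantees every work-item has finished reading the shared maximum from
// tmpx1[0] before the buffer is overwritten with partial sums.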
barrier(CLK_LOCAL_MEM_FENCE);
tmpx1[tid] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
sum = dot((float4)(1.0f), tmp[0]);
sum += dot((float4)(1.0f), tmp[1]);
sum += dot((float4)(1.0f), tmp[2]);
sum += dot((float4)(1.0f), tmp[3]);
sum += dot((float4)(1.0f), tmp[4]);
sum += dot((float4)(1.0f), tmp[5]);
sum += dot((float4)(1.0f), tmp[6]);
sum += dot((float4)(1.0f), tmp[7]);
tmpx1[0] = 1.0f / sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = tmpx1[0];

// Compute Result
if (c_blk_idx < c_blks) {
float4 src = read_imagef(input, SAMPLER, (int2)(c_blk_idx, b_idx)) -
(float4)(maximum);
CL_DTYPE4 res = CONVERT_TYPE_TO(exp(src) * sum, CL_DTYPE4);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(c_blk_idx, b_idx), res);
}
}
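
For orientation, here is a hypothetical host-side launch of this kernel (the wrapper types and variable names are assumptions, not code from this commit). The reductions above require a workgroup of exactly 32 work-items in dimension 0, and `mask` zeroes the padded lanes of the last channel block:
```
// Hypothetical launch sketch; assumes the OpenCL C++ wrapper is available.
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/cl2.hpp>

void LaunchSoftmax1x1(cl::CommandQueue& queue,
                      cl::Kernel& kernel,
                      const cl::Image2D& input,
                      cl::Image2D& output,
                      int channels,
                      int batch) {
  const int c_blks = (channels + 3) / 4;  // UP_DIV(channels, 4)

  // mask keeps only the valid lanes of the last channel block so padded
  // values do not pollute the max/sum reductions.
  cl_float4 mask = {{0.0f, 0.0f, 0.0f, 0.0f}};
  const int remain = (channels % 4 == 0) ? 4 : channels % 4;
  for (int i = 0; i < remain; ++i) mask.s[i] = 1.0f;

  kernel.setArg(0, input);
  kernel.setArg(1, output);
  kernel.setArg(2, mask);
  kernel.setArg(3, c_blks);

  // The kernel's local reductions assume exactly 32 work-items in dim 0
  // (8 float4 slots in __local memory).
  const size_t local_x = 32;
  const size_t global_x = ((c_blks + local_x - 1) / local_x) * local_x;
  queue.enqueueNDRangeKernel(kernel,
                             cl::NullRange,
                             cl::NDRange(global_x, static_cast<size_t>(batch)),
                             cl::NDRange(local_x, 1));
}
```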
2 changes: 1 addition & 1 deletion lite/core/optimizer.cc
@@ -224,7 +224,7 @@ std::unique_ptr<RuntimeProgram> RunDefaultOptimizer(
"variable_place_inference_pass", // inference arg/var's
"control_flow_op_shared_inputs_and_outputs_place_sync_pass",
"__fpga_kernel_place_correct_pass",
"opencl_kernel_place_correct_pass",
// "opencl_kernel_place_correct_pass", // uncommit this pass
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
65 changes: 48 additions & 17 deletions lite/core/profile/precision_profiler.h
@@ -136,9 +136,8 @@ class PrecisionProfiler {
MkDirRecur(log_dir_);
const char* write_to_file_raw =
std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE");
write_result_to_file_ = (write_to_file_raw && atoi(write_to_file_raw) > 0)
? atoi(write_to_file_raw) > 0
: false;
write_result_to_file_ =
(write_to_file_raw && atoi(write_to_file_raw) > 0) ? true : false;
}

std::string GetSummaryHeader() {
@@ -235,6 +234,7 @@ class PrecisionProfiler {
}

void compute_tensor_precision_info(const Tensor* in,
const std::string op_name,
DataLayoutType layout_type,
double* mean,
double* std_dev,
@@ -258,7 +258,9 @@
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<float>(in, name, log_dir_);
}
return;
}
#ifdef ENABLE_ARM_FP16
@@ -268,15 +270,19 @@
*std_dev =
compute_standard_deviation<__fp16>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<__fp16>(ptr, in->numel());
write_result_to_file&& write_tensorfile<__fp16>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<__fp16>(in, name, log_dir_);
}
return;
}
#endif
case PRECISION(kBool): {
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
write_result_to_file&& write_tensorfile<bool>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<bool>(in, name, log_dir_);
}
return;
}
case PRECISION(kInt8): {
@@ -285,7 +291,9 @@
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int8_t>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<int8_t>(in, name, log_dir_);
}
return;
}
case PRECISION(kInt32): {
@@ -294,15 +302,19 @@
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<int32_t>(in, name, log_dir_);
}
return;
}
case PRECISION(kInt64): {
auto ptr = in->data<int64_t>();
*mean = compute_mean<int64_t>(ptr, in->numel());
*std_dev = compute_standard_deviation<int64_t>(
ptr, in->numel(), true, *mean);
write_result_to_file&& write_tensorfile<int64_t>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<int64_t>(in, name, log_dir_);
}
return;
}
default:
@@ -323,6 +335,10 @@
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
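// The new FC and softmax kernels store a 2-D [N, C] tensor as an image of
// width C/4 (4 channels per pixel) and height N, which differs from the
// default converter's shape, so override it here.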
if (op_name == "fc" || op_name == "softmax") {
image_shape = DDim(std::vector<DDim::value_type>(
{in->dims()[1] / 4, in->dims()[0]}));
}
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
@@ -345,6 +361,9 @@
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
// TODO(zhaoyang-star): Tensor shape padding mode will change from
// high-dim padding to low-dim padding to fit image2d.
// ImageConverter will be changed.
default_convertor.ImageToNCHW(
in_data_v, real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
@@ -359,8 +378,9 @@
memcpy(real_out_data,
real_out_v.data(),
real_out_v.size() * sizeof(float));
write_result_to_file&& write_tensorfile<float>(
real_out_t.get(), name, log_dir_);
if (write_result_to_file) {
write_tensorfile<float>(real_out_t.get(), name, log_dir_);
}
return;
}
case DATALAYOUT(kNCHW): {
@@ -381,8 +401,9 @@
memcpy(real_out_data,
in_data_v.data(),
in_data_v.size() * sizeof(float));
write_result_to_file&& write_tensorfile<float>(
real_out_t.get(), name, log_dir_);
if (write_result_to_file) {
write_tensorfile<float>(real_out_t.get(), name, log_dir_);
}
return;
}
default:
@@ -409,7 +430,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<float>(in, name, log_dir_);
}
return;
}
case PRECISION(kInt32): {
@@ -424,7 +447,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<int>(in, name, log_dir_);
}
return;
}
case PRECISION(kInt64): {
@@ -439,7 +464,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int64_t>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<int64_t>(in, name, log_dir_);
}
return;
}
case PRECISION(kFP16): {
@@ -460,7 +487,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
if (write_result_to_file) {
write_tensorfile<float>(in, name, log_dir_);
}
return;
}
default:
@@ -521,6 +550,7 @@

if (tout->IsInitialized()) {
compute_tensor_precision_info(tout,
op_name,
type->layout(),
&mean,
&std_dev,
@@ -559,6 +589,7 @@

if (tout->IsInitialized()) {
compute_tensor_precision_info(tout,
op_name,
type->layout(),
&mean,
&std_dev,
7 changes: 7 additions & 0 deletions lite/kernels/opencl/CMakeLists.txt
@@ -42,6 +42,7 @@ add_kernel(batch_norm_opencl_image OPENCL basic SRCS batch_norm_image_compute.cc
add_kernel(reduce_mean_opencl_image OPENCL basic SRCS reduce_mean_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(clip_opencl_image OPENCL basic SRCS clip_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(softmax_opencl_image OPENCL basic SRCS softmax_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_image OPENCL basic SRCS fc_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...

@@ -121,6 +122,12 @@ lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
lite_cc_test(test_trigonometric_image_opencl SRCS trigonometric_image_compute_test.cc
DEPS trigonometric_opencl_image op_registry program context)

lite_cc_test(test_fc_image_opencl SRCS fc_image_compute_test.cc
DEPS fc_opencl_image op_registry program context)

lite_cc_test(test_softmax_image_opencl SRCS softmax_image_compute_test.cc
DEPS softmax_opencl_image op_registry program context)

######################
# buffer kernel #
######################
2 changes: 2 additions & 0 deletions lite/kernels/opencl/conv_image_compute.cc
@@ -1506,6 +1506,8 @@ void ConvImageCompute::Conv2d1x1FC() {
status_ = kernel_.setArg(cnt++, *second_input_image_p_);
CL_CHECK_FATAL(status_);
}
status_ = kernel_.setArg(cnt++, output_tensor_n_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(cnt++, UP_DIV(input_tensor_c_, 4));
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(cnt++, UP_DIV(output_tensor_c_, 4));
(The remaining changed files are not shown here.)