diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md index e6961532823d..c60a7b0e12f5 100644 --- a/apps/cpp_rtvm/README.md +++ b/apps/cpp_rtvm/README.md @@ -352,3 +352,17 @@ python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchai python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker {TVM_TRACKER_HOST}:{TVM_TRACKER_PORT} --print-time ``` + +# Use pre-compiled OpenCL kernels +Using pre-compiled programs might significantly improve inference time of the +first run. E.g. for topology with ~300 kernels compilation time on Adreno was +about 26 seconds. But after dumping compiled programs to binary files and reuse +them on the next runs, the compilation time was significantly decreased (more +than 1000 times) and starts to be around 25 ms. + +To use such functionality, the developer have to pass parameter `--pre-compiled` +to the `rtvm` and specify the file name where pre-compiled programs will be +stored. If the pre-compiled file name was passed to the `rtvm` then After method +`Load`, method `UsePreCompiledProgram` is called. This method loads pre-compiled +programs if the file exists. In opposite case the file will be created and +pre-compiled programs will be saved to this file. diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc index 31019ee0c9cf..c38a5f62bd9a 100644 --- a/apps/cpp_rtvm/main.cc +++ b/apps/cpp_rtvm/main.cc @@ -54,6 +54,7 @@ static const string kUsage = "--input - Numpy file for the model input (optional and we use random of not given)\n" "--output - Numpy file name to dump the model output as numpy\n" "--dump-meta - Dump model meta information\n" + "--pre-compiled - The file name of a file where pre-compiled programs should be stored" "\n" " Example\n" " ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n" @@ -66,12 +67,14 @@ static const string kUsage = * \arg device The target device to use {llvm, cl, ...etc.} * \arg input Numpy file for the model input * \arg output Numpy file name to dump the model output as numpy + * \arg pre_compiled File name where pre-compiled programs should be stored */ struct ToolArgs { string model; string device; string input; string output; + string pre_compiled; bool dump_meta = false; }; @@ -84,6 +87,7 @@ void PrintArgs(const ToolArgs& args) { LOG(INFO) << "Device = " << args.device; LOG(INFO) << "Input = " << args.input; LOG(INFO) << "Output = " << args.output; + LOG(INFO) << "Pre-compiled = " << args.pre_compiled; LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False")); } @@ -172,6 +176,8 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) { if (!pmeta.empty()) { args.dump_meta = true; } + + args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled="); } /*! @@ -190,6 +196,9 @@ int ExecuteModel(ToolArgs& args) { // Load the model runner.Load(); + if (!args.pre_compiled.empty()) { + runner.UsePreCompiledPrograms(args.pre_compiled); + } // Query Model meta Information TVMMetaInfo mInfo = runner.GetMetaInfo(); diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc index 74498e8170c4..2fd4f2281e01 100644 --- a/apps/cpp_rtvm/tvm_runner.cc +++ b/apps/cpp_rtvm/tvm_runner.cc @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -67,7 +68,8 @@ int GetTVMDevice(std::string device) { * \param path where the tfm compiler artifacts present. * \param device the target device where we need to load the compiled model. */ -TVMRunner::TVMRunner(std::string path, std::string device) : r_model_path(path), r_device(device) { +TVMRunner::TVMRunner(std::string path, std::string device) + : r_model_path(path), r_device(device), r_run_was_called(false) { LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << r_device; } @@ -110,6 +112,30 @@ int TVMRunner::Load(void) { return 0; } +/*! + * \brief Specify if the run programs should be dumped to binary and reused in the next runs. + * \param file_name File name where pre-compiled programs should be stored. + */ +void TVMRunner::UsePreCompiledPrograms(std::string file_name) { + if (r_run_was_called) { + LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before first run"; + return; + } + auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms", true); + auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms", true); + if (f_get != nullptr && f_set != nullptr) { + std::ifstream ifs(file_name, std::ios::in | std::ios::binary); + if (ifs.fail()) { + auto bytes = String(f_get()); + std::ofstream fs(file_name, std::ofstream::binary); + fs.write(bytes.c_str(), bytes.size()); + } else { + std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); + f_set(String(bytes)); + } + } +} + /*! * \brief Calculated the memory size for the NDArray. * \param NDArray object. @@ -242,6 +268,7 @@ int TVMRunner::GetOutput(std::string output_id, char* raw_output) { */ int TVMRunner::Run(void) { LOG(INFO) << "TVMRunner::Run"; + r_run_was_called = true; r_graph_handle.GetFunction("run")(); return 0; diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h index 37ba53606ee8..926e009c4c2e 100644 --- a/apps/cpp_rtvm/tvm_runner.h +++ b/apps/cpp_rtvm/tvm_runner.h @@ -56,6 +56,8 @@ class TVMRunner { /*! \brief Initiates graph runtime and with the compiled model */ int Load(void); + /*! \brief Specify if the run programs should be dumped to binary and reused in the next runs */ + void UsePreCompiledPrograms(std::string); /*! \brief Executes one inference cycle */ int Run(void); /*! \brief To set the inputs from given npz file */ @@ -86,6 +88,8 @@ class TVMRunner { std::string r_device; /*! \brief Holds meta information queried from graph runtime */ TVMMetaInfo mInfo; + /*! \brief Mark if the run method was called */ + bool r_run_was_called; }; } // namespace runtime diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index c172a0f94539..a295ea396cd0 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -438,6 +438,8 @@ class OpenCLModuleNode : public ModuleNode { // install a new kernel to thread local entry cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t, const std::string& func_name, const KTRefEntry& e); + void SetPreCompiledPrograms(const std::string& bytes); + std::string GetPreCompiledPrograms(); private: // The workspace, need to keep reference to use it in destructor. diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index aa31d80d6e8b..c53523267d66 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -202,7 +202,7 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device cl_int err_code; desc->host_ptr = reinterpret_cast( clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0, - sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code)); + sizeof(cl_uchar) * size, 0, nullptr, nullptr, &err_code)); OPENCL_CHECK_ERROR(err_code); #endif // OPENCL_ENABLE_HOST_PTR return desc; @@ -256,7 +256,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { cl::BufferDescriptor* desc = static_cast(ptr); if (desc->host_ptr) { clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, - reinterpret_cast(desc->host_ptr), 0, NULL, NULL); + reinterpret_cast(desc->host_ptr), 0, nullptr, nullptr); } OPENCL_CALL(clReleaseMemObject(desc->buffer)); delete desc; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 2fb157aac6af..ad41a34dde4e 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -137,6 +137,15 @@ cl::OpenCLWorkspace* OpenCLModuleNode::GetGlobalWorkspace() { PackedFunc OpenCLModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { ICHECK_EQ(sptr_to_self.get(), this); + if (name == "opencl.GetPreCompiledPrograms") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetPreCompiledPrograms(); + }); + } else if (name == "opencl.SetPreCompiledPrograms") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + this->SetPreCompiledPrograms(args[0]); + }); + } ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; auto it = fmap_.find(name); if (it == fmap_.end()) return PackedFunc(); @@ -262,6 +271,74 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre return kernel; } +void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { + std::string data = bytes; + dmlc::MemoryStringStream reader(&data); + dmlc::Stream* strm = &reader; + uint64_t kernels_num; + strm->Read(&kernels_num); + cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); + int device_id = t->device.device_id; + for (size_t i = 0; i < kernels_num; ++i) { + std::string name; + std::vector bin_vector; + strm->Read(&name); + strm->Read(&bin_vector); + if (programs_[name][device_id] == nullptr) { + cl_int err = 0; + cl_int binaryStatus; + size_t binarySize = bin_vector.size(); + const unsigned char* programBinary = bin_vector.data(); + + cl_device_id dev = workspace_->devices[device_id]; + programs_[name][device_id] = clCreateProgramWithBinary( + workspace_->context, 1, &dev, &binarySize, &programBinary, &binaryStatus, &err); + OPENCL_CHECK_ERROR(err); + OPENCL_CHECK_ERROR(binaryStatus); + + err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr, nullptr, nullptr); + if (err != CL_SUCCESS) { + size_t len; + std::string log; + clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, + &len); + log.resize(len); + clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0], + nullptr); + LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log; + } + } + } +} + +std::string OpenCLModuleNode::GetPreCompiledPrograms() { + std::string data; + dmlc::MemoryStringStream writer(&data); + dmlc::Stream* strm = &writer; + strm->Write(static_cast(parsed_kernels_.size())); + for (auto& it : parsed_kernels_) { + std::string name = it.first; + cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); + int device_id = t->device.device_id; + t->kernel_table.resize(workspace_->num_registered_kernels); + if (programs_[std::string(name)][device_id] == nullptr) { + InstallKernel(workspace_, t, name, kid_map_[name]); + } + size_t size; + clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, + nullptr); + ICHECK(size > 0) << "Size of binary is 0"; + std::vector bin_vector(size); + unsigned char* binary = bin_vector.data(); + clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES, sizeof(unsigned char*), + &binary, nullptr); + + strm->Write(name); + strm->Write(bin_vector); + } + return data; +} + Module OpenCLModuleCreate(std::string data, std::string fmt, std::unordered_map fmap, std::string source) { auto n = make_object(data, fmt, fmap, source); diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc index 2c2768945424..13b7d9470646 100644 --- a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc +++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc @@ -137,6 +137,7 @@ using f_clCreateProgramWithBinary = cl_program (*)(cl_context, cl_uint, const cl using f_clReleaseProgram = cl_int (*)(cl_program); using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*, const char*, void (*pfn_notify)(cl_program program, void* user_data), void*); +using f_clGetProgramInfo = cl_int (*)(cl_program, cl_program_info, size_t, void*, size_t*); using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void*, size_t*); using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*); @@ -347,6 +348,17 @@ cl_int clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_i } } +cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size, + void* param_value, size_t* param_value_size_ret) { + auto& lib = LibOpenCLWrapper::getInstance(); + auto func = (f_clGetProgramInfo)lib.getOpenCLFunction("clGetProgramInfo"); + if (func) { + return func(program, param_name, param_value_size, param_value, param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) { diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc new file mode 100644 index 000000000000..a1bdeb9c1408 --- /dev/null +++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::cl; + +namespace { +// This kernel was generated by TVM for conv2d operation +const std::string kernelTemplate = R"( +// Function: kernel_name_placeholder0 +__kernel void kernel_name_placeholder0(__write_only image2d_t pad_temp_texture, __read_only image2d_t placeholder0) { + const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + float4 _1 = read_imagef(placeholder0, image_sampler, (int2)(((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9) - 1), ((((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) / 81) * 7) + ((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81) / 9)) - 1))); + (void)write_imagef(pad_temp_texture, (int2)((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9), (((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) / 9)), (((((9 <= (((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81)) && ((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81) < 72)) && (1 <= (((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9))) && ((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9) < 8)) ? _1 : ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); +} + +// Function: kernel_name_placeholder1 +__kernel void kernel_name_placeholder1(__read_only image2d_t pad_temp_texture, __read_only image2d_t placeholder1, __write_only image2d_t compute, __read_only image2d_t placeholder2, __read_only image2d_t placeholder3) { + const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + float4 compute1[14]; + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 0); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 28); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 4); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 32); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 8); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 36); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 12); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 40); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 16); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 44); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 20); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 48); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 24); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 52); + for (int rc_inner = 0; rc_inner < 128; ++rc_inner) { + for (int ry_inner = 0; ry_inner < 3; ++ry_inner) { + for (int rx_inner = 0; rx_inner < 3; ++rx_inner) { + for (int rc = 0; rc < 4; ++rc) { + float4 _1 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), ((rc_inner * 9) + ry_inner))); + float4 _2 = read_imagef(placeholder1, image_sampler, (int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner), ((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))))); + vstore4((vload4(0, (float*)compute1 + 0) + (((float*)&_1)[rc] * _2)), 0, (float*)compute1 + 0); + float4 _3 = read_imagef(placeholder1, image_sampler, (int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner), (((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8))); + vstore4((vload4(0, (float*)compute1 + 28) + (((float*)&_1)[rc] * _3)), 0, (float*)compute1 + 28); + float4 _4 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 1))); + vstore4((vload4(0, (float*)compute1 + 4) + (((float*)&_4)[rc] * _2)), 0, (float*)compute1 + 4); + vstore4((vload4(0, (float*)compute1 + 32) + (((float*)&_4)[rc] * _3)), 0, (float*)compute1 + 32); + float4 _5 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 2))); + vstore4((vload4(0, (float*)compute1 + 8) + (((float*)&_5)[rc] * _2)), 0, (float*)compute1 + 8); + vstore4((vload4(0, (float*)compute1 + 36) + (((float*)&_5)[rc] * _3)), 0, (float*)compute1 + 36); + float4 _6 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 3))); + vstore4((vload4(0, (float*)compute1 + 12) + (((float*)&_6)[rc] * _2)), 0, (float*)compute1 + 12); + vstore4((vload4(0, (float*)compute1 + 40) + (((float*)&_6)[rc] * _3)), 0, (float*)compute1 + 40); + float4 _7 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 4))); + vstore4((vload4(0, (float*)compute1 + 16) + (((float*)&_7)[rc] * _2)), 0, (float*)compute1 + 16); + vstore4((vload4(0, (float*)compute1 + 44) + (((float*)&_7)[rc] * _3)), 0, (float*)compute1 + 44); + float4 _8 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 5))); + vstore4((vload4(0, (float*)compute1 + 20) + (((float*)&_8)[rc] * _2)), 0, (float*)compute1 + 20); + vstore4((vload4(0, (float*)compute1 + 48) + (((float*)&_8)[rc] * _3)), 0, (float*)compute1 + 48); + float4 _9 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 6))); + vstore4((vload4(0, (float*)compute1 + 24) + (((float*)&_9)[rc] * _2)), 0, (float*)compute1 + 24); + vstore4((vload4(0, (float*)compute1 + 52) + (((float*)&_9)[rc] * _3)), 0, (float*)compute1 + 52); + } + } + } + } + float4 _10 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _11 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), ((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7))), max(((vload4(0, (float*)compute1 + 0) * _10) + _11), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _12 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _13 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 56)), max(((vload4(0, (float*)compute1 + 28) * _12) + _13), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _14 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _15 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 1)), max(((vload4(0, (float*)compute1 + 4) * _14) + _15), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _16 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _17 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 57)), max(((vload4(0, (float*)compute1 + 32) * _16) + _17), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _18 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _19 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 2)), max(((vload4(0, (float*)compute1 + 8) * _18) + _19), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _20 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _21 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 58)), max(((vload4(0, (float*)compute1 + 36) * _20) + _21), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _22 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _23 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 3)), max(((vload4(0, (float*)compute1 + 12) * _22) + _23), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _24 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _25 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 59)), max(((vload4(0, (float*)compute1 + 40) * _24) + _25), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _26 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _27 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 4)), max(((vload4(0, (float*)compute1 + 16) * _26) + _27), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _28 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _29 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 60)), max(((vload4(0, (float*)compute1 + 44) * _28) + _29), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _30 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _31 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 5)), max(((vload4(0, (float*)compute1 + 20) * _30) + _31), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _32 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _33 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 61)), max(((vload4(0, (float*)compute1 + 48) * _32) + _33), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _34 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _35 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 6)), max(((vload4(0, (float*)compute1 + 24) * _34) + _35), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _36 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _37 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 62)), max(((vload4(0, (float*)compute1 + 52) * _36) + _37), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); +} + + )"; +} // namespace + +using Timestamp = std::chrono::time_point; + +class OpenCLCompileBin : public ::testing::Test { + protected: + virtual void SetUp() override { + m_workspace = OpenCLWorkspace::Global(); + OpenCLThreadEntry* t = m_workspace->GetThreadEntry(); + t->kernel_table.resize(m_kernelsNum * 2); + m_kernelNames.resize(m_kernelsNum * 2); + m_dataSrc = ""; + m_fmap.clear(); + for (size_t i = 0; i < m_kernelsNum; ++i) { + std::string kernel_name = "generated_kernel_" + std::to_string(i) + "_"; + std::string kernelSource = + std::regex_replace(kernelTemplate, std::regex("kernel_name_placeholder"), kernel_name); + FunctionInfo fi1 = {kernel_name + "0"}; + FunctionInfo fi2 = {kernel_name + "1"}; + m_fmap[fi1.name] = fi1; + m_fmap[fi2.name] = fi2; + m_kernelNames[i * 2] = fi1.name; + m_kernelNames[i * 2 + 1] = fi2.name; + m_dataSrc += kernelSource; + } + } + + protected: + const size_t m_kernelsNum = 100; + const std::string m_tmpDirName = "OpenCLCompileBin_dir"; + OpenCLWorkspace* m_workspace; + std::string m_dataSrc; + std::unordered_map m_fmap; + std::vector m_kernelNames; +}; + +TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { + double compileFromSourceTimeMS, compileFromBinTimeMS; + std::string bytes; + { + OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); + module.Init(); + Timestamp comp_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < m_kernelNames.size(); ++i) { + OpenCLModuleNode::KTRefEntry e = {i, 1}; + module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); + } + Timestamp comp_end = std::chrono::high_resolution_clock::now(); + auto get_pre_compiled_f = + module.GetFunction("opencl.GetPreCompiledPrograms", GetObjectPtr(&module)); + bytes = String(get_pre_compiled_f()); + std::chrono::duration duration = + std::chrono::duration_cast(comp_end - comp_start); + compileFromSourceTimeMS = duration.count() * 1e-6; + std::cout << "Compile time from source: " << compileFromSourceTimeMS << " ms." << std::endl; + } + { + OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); + module.Init(); + module.GetFunction("opencl.SetPreCompiledPrograms", + GetObjectPtr(&module))(String(bytes)); + Timestamp comp_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < m_kernelNames.size(); ++i) { + OpenCLModuleNode::KTRefEntry e = {i, 1}; + module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); + } + Timestamp comp_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = + std::chrono::duration_cast(comp_end - comp_start); + compileFromBinTimeMS = duration.count() * 1e-6; + std::cout << "Compile time from bin: " << compileFromBinTimeMS << " ms." << std::endl; + } + ASSERT_LT(compileFromBinTimeMS, compileFromSourceTimeMS); +}