xe: ukernel: Delay microkernel check until micro_sdpa creation

Checks for the compatibility to build/run microkernel codes were performed at program initialization time. This commit delays the checks to a later point so that they are only performed once the micro_sdpa kernel is generated. The reason for this is that the vISA compiler generates a kernel.errors.txt if there is an error and this caused the CI to fail because the build directory is dirty afterwards.
oneapi-src · Nov 18, 2024 · 48f6bd9 · 48f6bd9
1 parent e72f65d
commit 48f6bd9
Showing 9 changed files with 107 additions and 44 deletions.
diff --git a/src/gpu/intel/compute/device_info.cpp b/src/gpu/intel/compute/device_info.cpp
@@ -290,7 +290,6 @@ status_t device_info_t::init_serialized_device_info(
     serialized_device_info_.write(&mayiuse_systolic_);
     serialized_device_info_.write(&mayiuse_ngen_kernels_);
     serialized_device_info_.write(&mayiuse_system_memory_allocators_);
-    serialized_device_info_.write(&mayiuse_microkernels_);
     serialized_device_info_.write(&mayiuse_non_uniform_work_groups_);
 
     const size_t name_size = name_.size();
@@ -332,7 +331,6 @@ status_t device_info_t::init_from_cache_blob(
     DESERIALIZE(mayiuse_systolic_, bool);
     DESERIALIZE(mayiuse_ngen_kernels_, bool);
     DESERIALIZE(mayiuse_system_memory_allocators_, bool);
-    DESERIALIZE(mayiuse_microkernels_, bool);
     DESERIALIZE(mayiuse_non_uniform_work_groups_, bool);
 #undef DESERIALIZE
 

diff --git a/src/gpu/intel/compute/device_info.hpp b/src/gpu/intel/compute/device_info.hpp
@@ -231,9 +231,6 @@ struct device_info_t {
 
     bool mayiuse_ngen_kernels() const { return mayiuse_ngen_kernels_; }
 
-    /// Returns true if the OpenCL compiler supports microkernels.
-    bool mayiuse_microkernels() const { return mayiuse_microkernels_; }
-
     bool mayiuse_systolic() const { return mayiuse_systolic_; }
 
     bool mayiuse_non_uniform_work_groups() const {
@@ -281,7 +278,6 @@ struct device_info_t {
     bool mayiuse_systolic_ = false;
     bool mayiuse_ngen_kernels_ = false;
     bool mayiuse_system_memory_allocators_ = false;
-    bool mayiuse_microkernels_ = false;
 
     std::string name_;
     xpu::runtime_version_t runtime_version_;

diff --git a/src/gpu/intel/ocl/micro_sdpa.cpp b/src/gpu/intel/ocl/micro_sdpa.cpp
@@ -153,8 +153,8 @@ status_t micro_sdpa_t::pd_t::init_microkernels(impl::engine_t *engine) {
     arch_ = dev_info->gpu_arch();
     auto *d = desc();
 
-    VCONDCHECK(primitive, create, check, sdpa,
-            (dev_info->mayiuse_microkernels()), status::unimplemented,
+    VCONDCHECK(primitive, create, check, sdpa, mayiuse_microkernels(engine),
+            status::unimplemented,
             "Microkernels not supported by the OpenCL driver.");
 
     /* Retrieve pre-tuned kernel configuration */

diff --git a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp
@@ -44,7 +44,7 @@ status_t ocl_gpu_device_info_t::init_arch(impl::engine_t *engine) {
 
     init_gpu_hw_info(engine, device, context, ip_version_, gpu_arch_,
             gpu_product_family_, stepping_id_, native_extensions_,
-            mayiuse_systolic_, mayiuse_ngen_kernels_, mayiuse_microkernels_);
+            mayiuse_systolic_, mayiuse_ngen_kernels_);
 
     err = clReleaseContext(context);
     OCL_CHECK(err);

diff --git a/src/gpu/intel/ocl/ocl_gpu_hw_info.cpp b/src/gpu/intel/ocl/ocl_gpu_hw_info.cpp
@@ -55,34 +55,10 @@ xpu::runtime_version_t get_driver_version(cl_device_id device) {
     return runtime_version;
 }
 
-/// Tries to build a kernel with assembly instructions to check to see if the
-/// OpenCL compiler supports microkernels.
-bool try_building_with_microkernels(cl_context context, cl_device_id device) {
-    const char *kernel_code = R""""(
-        kernel void igc_check() {
-            __asm__ volatile(
-                    ".decl AA0 v_type=G type=ud num_elts=1\n"
-                    ".decl AA1 v_type=G type=ud num_elts=1\n"
-                    ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n"
-                    ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n"
-                    "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n"
-            );
-        }
-        )"""";
-    cl_int err;
-    /// Not using existing build infrastructure to avoid error messages in the CI logs
-    xpu::ocl::wrapper_t<cl_program> program(
-            clCreateProgramWithSource(context, 1, &kernel_code, nullptr, &err));
-    if (err != CL_SUCCESS) return false;
-    err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
-    return err == CL_SUCCESS;
-}
-
 void init_gpu_hw_info(impl::engine_t *engine, cl_device_id device,
         cl_context context, uint32_t &ip_version, compute::gpu_arch_t &gpu_arch,
         int &gpu_product_family, int &stepping_id, uint64_t &native_extensions,
-        bool &mayiuse_systolic, bool &mayiuse_ngen_kernels,
-        bool &mayiuse_microkernels) {
+        bool &mayiuse_systolic, bool &mayiuse_ngen_kernels) {
     using namespace ngen;
     HW hw = HW::Unknown;
     Product product = {ProductFamily::Unknown, 0};
@@ -107,12 +83,6 @@ void init_gpu_hw_info(impl::engine_t *engine, cl_device_id device,
             = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
     if (status != status::success) mayiuse_ngen_kernels = false;
 
-    mayiuse_microkernels = get_driver_version(device)
-            >= xpu::runtime_version_t(24, 22, 29735);
-    if (!mayiuse_microkernels) {
-        mayiuse_microkernels = try_building_with_microkernels(context, device);
-    }
-
     ip_version = 0;
     if (clGetDeviceInfo(device, CL_DEVICE_IP_VERSION_INTEL, sizeof(ip_version),
                 &ip_version, nullptr)

diff --git a/src/gpu/intel/ocl/ocl_gpu_hw_info.hpp b/src/gpu/intel/ocl/ocl_gpu_hw_info.hpp
@@ -33,8 +33,7 @@ xpu::runtime_version_t get_driver_version(cl_device_id device);
 void init_gpu_hw_info(impl::engine_t *engine, cl_device_id device,
         cl_context context, uint32_t &ip_version, compute::gpu_arch_t &gpu_arch,
         int &gpu_product_family, int &stepping_id, uint64_t &native_extensions,
-        bool &mayiuse_systolic, bool &mayiuse_ngen_kernels,
-        bool &mayiuse_microkernels);
+        bool &mayiuse_systolic, bool &mayiuse_ngen_kernels);
 
 } // namespace ocl
 } // namespace intel

diff --git a/src/gpu/intel/ocl/ocl_utils.cpp b/src/gpu/intel/ocl/ocl_utils.cpp
@@ -17,13 +17,19 @@
 #include <algorithm>
 #include <cstring>
 #include <iostream>
+#include <mutex>
 #include <CL/cl_ext.h>
 
 #include "gpu/intel/ocl/ocl_gpu_engine.hpp"
+#include "gpu/intel/ocl/ocl_gpu_hw_info.hpp"
 #include "gpu/intel/ocl/ocl_gpu_kernel.hpp"
 #include "gpu/intel/ocl/ocl_utils.hpp"
 #include "xpu/ocl/utils.hpp"
 
+#if __has_include(<sycl/sycl.hpp>)
+#include "gpu/intel/sycl/engine.hpp"
+#endif
+
 #ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
 #define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D
 #endif
@@ -76,6 +82,99 @@ namespace gpu {
 namespace intel {
 namespace ocl {
 
+/// Tries to build a kernel with assembly instructions to check to see if the
+/// OpenCL compiler supports microkernels.
+bool try_building_with_microkernels(cl_context context, cl_device_id device) {
+    const char *kernel_code = R""""(
+        kernel void igc_check() {
+            __asm__ volatile(
+                    ".decl AA0 v_type=G type=ud num_elts=1\n"
+                    ".decl AA1 v_type=G type=ud num_elts=1\n"
+                    ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n"
+                    ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n"
+                    "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n"
+            );
+        }
+        )"""";
+    cl_int err;
+    /// Not using existing build infrastructure to avoid error messages in the CI logs
+    xpu::ocl::wrapper_t<cl_program> program(
+            clCreateProgramWithSource(context, 1, &kernel_code, nullptr, &err));
+    if (err != CL_SUCCESS) return false;
+    err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
+    return err == CL_SUCCESS;
+}
+
+int get_sycl_ocl_device_and_context(
+        xpu::ocl::wrapper_t<cl_context> &ocl_context,
+        xpu::ocl::wrapper_t<cl_device_id> &ocl_device,
+        const impl::engine_t *engine) {
+#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
+    auto *sycl_engine = utils::downcast<const sycl::engine_t *>(engine);
+    auto &device = sycl_engine->device();
+
+    auto be = xpu::sycl::get_backend(device);
+    if (be == xpu::sycl::backend_t::opencl) {
+        cl_int err = CL_SUCCESS;
+        auto ocl_dev = xpu::sycl::compat::get_native<cl_device_id>(device);
+        ocl_device = xpu::ocl::make_wrapper(ocl_dev, true);
+
+        ocl_context = xpu::ocl::make_wrapper(
+                clCreateContext(nullptr, 1, &ocl_dev, nullptr, nullptr, &err),
+                true);
+        if (err) return -1;
+    } else if (be == xpu::sycl::backend_t::level0) {
+        std::unique_ptr<gpu::intel::ocl::ocl_gpu_engine_t, engine_deleter_t>
+                ocl_engine;
+        auto err
+                = gpu::intel::sycl::create_ocl_engine(&ocl_engine, sycl_engine);
+        if (err != status::success) return -1;
+        ocl_device = xpu::ocl::make_wrapper(ocl_engine->device(), true);
+        ocl_context = xpu::ocl::make_wrapper(ocl_engine->context(), true);
+    }
+#endif
+    return 0;
+}
+
+bool mayiuse_microkernels(const impl::engine_t *engine) {
+    auto mayiuse_mk = [](const impl::engine_t *engine) {
+        xpu::ocl::wrapper_t<cl_device_id> ocl_device;
+        xpu::ocl::wrapper_t<cl_context> ocl_context;
+
+        switch (engine->runtime_kind()) {
+            case runtime_kind::sycl: {
+                auto err = get_sycl_ocl_device_and_context(
+                        ocl_context, ocl_device, engine);
+                if (err) return false;
+            } break;
+            case runtime_kind::ocl: {
+                const ocl_gpu_engine_t *eng
+                        = utils::downcast<const ocl_gpu_engine_t *>(engine);
+                ocl_device = xpu::ocl::make_wrapper(eng->device(), true);
+                ocl_context = xpu::ocl::make_wrapper(eng->context(), true);
+            } break;
+            default: return false;
+        }
+
+        bool mayiuse_microkernels = get_driver_version(ocl_device)
+                >= xpu::runtime_version_t(24, 22, 29735);
+        if (!mayiuse_microkernels) {
+            mayiuse_microkernels
+                    = try_building_with_microkernels(ocl_context, ocl_device);
+        }
+        return mayiuse_microkernels;
+    };
+
+    static std::map<engine_id_t, bool> engine_microkernel_map {
+            {engine->engine_id(), mayiuse_mk(engine)}};
+
+    static std::mutex map_mutex;
+    std::lock_guard<std::mutex> map_lock(map_mutex);
+    auto it = engine_microkernel_map.find(engine->engine_id());
+    if (it != std::end(engine_microkernel_map)) { return it->second; }
+    return engine_microkernel_map[engine->engine_id()] = mayiuse_mk(engine);
+}
+
 status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type,
         cl_kernel ocl_kernel, cl_uint idx, bool allow_undef) {
     char s_type[16];

diff --git a/src/gpu/intel/ocl/ocl_utils.hpp b/src/gpu/intel/ocl/ocl_utils.hpp
@@ -40,6 +40,8 @@ namespace ocl {
 
 enum { OCL_BUFFER_ALIGNMENT = 128 };
 
+bool mayiuse_microkernels(const impl::engine_t *engine);
+
 status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type,
         cl_kernel ocl_kernel, int idx, bool allow_undef = false);
 

diff --git a/src/gpu/intel/sycl/device_info.cpp b/src/gpu/intel/sycl/device_info.cpp
@@ -55,7 +55,7 @@ status_t device_info_t::init_arch(impl::engine_t *engine) {
         gpu::intel::ocl::init_gpu_hw_info(engine, ocl_dev_wrapper,
                 ocl_ctx_wrapper, ip_version_, gpu_arch_, gpu_product_family_,
                 stepping_id_, native_extensions_, mayiuse_systolic_,
-                mayiuse_ngen_kernels_, mayiuse_microkernels_);
+                mayiuse_ngen_kernels_);
     } else if (be == xpu::sycl::backend_t::level0) {
         // TODO: add support for L0 binary ngen check
         // XXX: query from ocl_engine for now
@@ -70,7 +70,6 @@ status_t device_info_t::init_arch(impl::engine_t *engine) {
         stepping_id_ = dev_info->stepping_id();
         mayiuse_systolic_ = dev_info->mayiuse_systolic();
         mayiuse_ngen_kernels_ = dev_info->mayiuse_ngen_kernels();
-        mayiuse_microkernels_ = dev_info->mayiuse_microkernels();
     } else {
         assert(!"not_expected");
     }