Squashed 'thirdParty/alpaka/' changes from 6854077d6d..106a4975f4
106a4975f4 fix getFunctionAttributes for the SYCL backend
f36e1156af update CUDA version in CI
3f8456973e use inline for CUDA/HIP code when rdc is on, otherwise use static
8b9cc3c557 fix gh-pages jobA
89d5ce671c Ignore VI temporary files
4b7bd17493 Fix the device used by KernelExecutionFixture (ComputationalRadiationPhysics#2344)
2c386dc5e9 Make alpaka follow CMAKE_CUDA_RUNTIME_LIBRARY
2d652dd233 Add thread count to CPU blocks accelerators (ComputationalRadiationPhysics#2338)
dbc5ebe1e9 Fix complex math pow test (ComputationalRadiationPhysics#2336)
4995c5b22a Fix isValidWorkDivKernel to use the correct device
f571ce9197 Remove unnecessary include
a26cdbcd41 Re-enable the KernelNoTemplateGpu test
a9217fb780 Link libcudart even when libcurand is not used
9c8614143b Suppress GCC warning about casting a function to void*
ba169cdc52 Rewrite the getValidWorkDivForKernel tests
948eb757d4 Fix getValidWorkDivForKernel tests for the SYCL CPU backend
f6f94f13b5 Fix getValidWorkDivForKernel tests for the CUDA backend
f612f971a0 Reduce code duplications in matrixMulWithMdSpan (ComputationalRadiationPhysics#2326)
d1cc2e01c1 Add a matrix multiplication example using mdspan
536a183cce Add missing whitespace in enqueue log messages
81d4410eec Reduce code duplication in CUDA/HIP kernel launch
6fdec14904  add remove-restrict
5323600508 CI: improve script utils
01d123e605 fix missing C++20 STL for ICPX in the CI
d254bcd6a3 ctest: display only output of tests, which failed
c9b8c941af change documentation
b9ed742913 remove getValidWorkDiv itself
048ef8afca use getValidWorkDivForKernel in kernelfixture of tests
38805498f0 fix random strategies
4f175420f2 remove getValidWorkDiv first
7f08120428 CI_FILTER: ^linux_nvcc11.*
789344f019 ALPAKA_FN_HOST is not a type
4efdb9dc63 fix explicit instantiation issue
fe4106f88a CI_FILTER: ^linux_nvcc11.*gcc9
e6b4881b70 CI_FILTER: ^linux_nvcc11.*gcc9
e3e760ed9e make conv2dmdspan use kernelbundle
62efffe605 Add getValidWorkDivForKernel function and KernelBundle with tests
690da679bd Let the SYCL queue implement `ConceptCurrentThreadWaitFor`, `ConceptGetDev` and `ConceptQueue` (ComputationalRadiationPhysics#2314)
995c57b54b set alpaka_CXX_STANDARD in the job generator
6ad09baa38 remove nvcc11.0 and nvcc11.1 support (ComputationalRadiationPhysics#2310)
0775f7c066 clang-format and fix typo
18eeeb7b49 move complex declaration to internal namespace
3468d2f8ac add trait IsKernelTriviallyCopyable
3015eae06b update CI container to version 3.2
56c0e416bc Update Catch2 to v3.5.2 (ComputationalRadiationPhysics#2300)

git-subtree-dir: thirdParty/alpaka
git-subtree-split: 106a4975f48dc38cc34f6a2494a3d16774282951
Third Party authored and psychocoderHPC committed Aug 7, 2024
1 parent fd24dd8 commit 8431571
Showing 348 changed files with 14,853 additions and 6,020 deletions.
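
The headline API change in this squash is the removal of alpaka::getValidWorkDiv in favour of the kernel-aware alpaka::getValidWorkDivForKernel and the new alpaka::KernelBundle (commits 62efffe605, ba169cdc52, b9ed742913). As orientation for the diffs below, the migration looks roughly as follows; this is a sketch assembled from the changes in this commit, with kernel, arg0 and arg1 as illustrative placeholders:

// Before: the work division was derived from the launch extent alone.
auto const workDiv = alpaka::getValidWorkDiv<Acc>(
    devAcc,
    threadsPerGrid,
    elementsPerThread,
    false,
    alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

// After: bundle the kernel with its launch arguments first, so that the
// work division can honour kernel-specific device limits.
auto const& bundledKernel = alpaka::KernelBundle(kernel, arg0, arg1);
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
    devAcc, bundledKernel, threadsPerGrid, elementsPerThread);
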
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
@@ -40,7 +40,7 @@ concurrency:
# alpaka_ACC_ANY_BT_OMP5_ENABLE : {ON, OFF}
# [ON] OMP_NUM_THREADS : {1, 2, 3, 4}
# alpaka_ACC_GPU_CUDA_ENABLE : {ON, OFF}
# [ON] ALPAKA_CI_CUDA_VERSION : {11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6}
# [ON] ALPAKA_CI_CUDA_VERSION : {11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2 12.3}
# [ON] CMAKE_CUDA_COMPILER : {nvcc, [CXX==clang++]:clang++}
# alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE : {ON, OFF}
# alpaka_ACC_GPU_HIP_ENABLE : {ON, OFF}
@@ -62,6 +62,7 @@ env:
ALPAKA_CI_ONEAPI_VERSION: 2024.0
ALPAKA_CI_TBB_VERSION: 2021.10.0
ALPAKA_CI_RUN_TESTS: ON
alpaka_CXX_STANDARD: 17
alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: ON
alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: ON
alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: ON
2 changes: 2 additions & 0 deletions .github/workflows/gh-pages.yml
@@ -6,6 +6,8 @@ on:
push:
branches:
- develop
env:
ALPAKA_CI_OS_NAME: "Linux"

jobs:
gh-pages:
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@

# tmp files
*~
.*.swp

# netbeans project files
/nbproject/
6 changes: 4 additions & 2 deletions .gitlab-ci.yml
@@ -17,7 +17,7 @@ variables:
# container version of the generated jobs
# should be merged with ALPAKA_GITLAB_CI_CONTAINER_VERSION
# see: script/job_generator/generate_job_yaml.py
ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION: "3.1"
ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION: "3.2"

generate:
stage: generator
@@ -27,7 +27,9 @@ generate:
- apk update && apk add python3~=3.11 py3-pip
- pip3 install -r script/job_generator/requirements.txt
# it is sufficient to verify once, as the same job matrix is generated, verified and then filtered each time
- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --verify --wave compile_only_job -o compile_only.yml
# disable verify because we know that the generator is broken: https://github.com/thombashi/allpairspy/pull/10
#- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --verify --wave compile_only_job -o compile_only.yml
- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave compile_only_job -o compile_only.yml
- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave runtime_job_cpu -o runtime_cpu.yml
- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave runtime_job_gpu -o runtime_gpu.yml
- cat compile_only.yml
6 changes: 3 additions & 3 deletions README.md
@@ -72,8 +72,8 @@ This library uses C++17 (or newer when available).
| OpenMP 2.0+ threads | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: |
| std::thread | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
| TBB | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
| CUDA (nvcc) | :white_check_mark: <br/> (CUDA 11.0 - 12.3)[^2] | :white_check_mark: <br/> (CUDA 11.4 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 12.0 - 12.3) | :x: | :white_check_mark: <br/> (CUDA 11.0-11.2; 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.7 - 12.0) | :white_check_mark: <br/> (CUDA 11.8 - 12.0) | :white_check_mark: <br/> (CUDA 12.2) | :white_check_mark: <br/> (CUDA 12.3) | :x: | :x: | :x: | :x: |
| CUDA (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (CUDA 11.0 - 11.5) | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.8)[^1] | :x: | - | - |
| CUDA (nvcc) | :white_check_mark: <br/> (CUDA 11.2 - 12.5)[^2] | :white_check_mark: <br/> (CUDA 11.4 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 12.0 - 12.5) | :white_check_mark: <br/> (CUDA 12.4 - 12.5) | :white_check_mark: <br/> (11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.7 - 12.0) | :white_check_mark: <br/> (CUDA 11.8 - 12.0) | :white_check_mark: <br/> (CUDA 12.2) | :white_check_mark: <br/> (CUDA 12.3) | :white_check_mark: <br/> (CUDA 12.4 - 12.5) | :x: | :x: | :x: |
| CUDA (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.5)[^1] | :white_check_mark: (CUDA 11.2 - 11.5)[^1] | :white_check_mark: (CUDA 11.2 - 11.8)[^1] | :x: | - | - |
| [HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (HIP 5.1 - 5.2) | :white_check_mark: (HIP 5.3 - 5.4) | :white_check_mark: (HIP 5.5 - 5.6) | :white_check_mark: (HIP 5.7 - 6.0) | :x: | - | - |
| SYCL | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :white_check_mark:[^4] | :x: | :x: |

@@ -91,7 +91,7 @@ Dependencies
The **alpaka** library itself just requires header-only libraries.
However some of the accelerator back-end implementations require different boost libraries to be built.

When an accelerator back-end using *CUDA* is enabled, version *11.0* (with nvcc as CUDA compiler) or version *9.2* (with clang as CUDA compiler) of the *CUDA SDK* is the minimum requirement.
When an accelerator back-end using *CUDA* is enabled, version *11.2* (with nvcc as CUDA compiler) or version *11.2* (with clang as CUDA compiler) of the *CUDA SDK* is the minimum requirement.
*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with any *OpenMP accelerator back-end* because this combination is currently unsupported.
*NOTE*: Separable compilation is disabled by default and can be enabled via the CMake flag `CMAKE_CUDA_SEPARABLE_COMPILATION`.

13 changes: 6 additions & 7 deletions benchmarks/babelstream/src/AlpakaStream.cpp
@@ -50,7 +50,7 @@ template<typename T>
void AlpakaStream<T>::init_arrays(T initA, T initB, T initC)
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

alpaka::exec<
Acc>(queue, workdiv, InitKernel{}, std::data(d_a), std::data(d_b), std::data(d_c), initA, initB, initC);
alpaka::wait(queue);
@@ -78,7 +78,7 @@ template<typename T>
void AlpakaStream<T>::copy()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

alpaka::exec<Acc>(queue, workdiv, CopyKernel{}, std::data(d_a), std::data(d_c));
alpaka::wait(queue);
}
@@ -98,7 +98,7 @@
void AlpakaStream<T>::mul()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

alpaka::exec<Acc>(queue, workdiv, MulKernel{}, std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}
@@ -117,7 +117,7 @@
void AlpakaStream<T>::add()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

alpaka::exec<Acc>(queue, workdiv, AddKernel{}, std::data(d_a), std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}
@@ -137,7 +137,7 @@
void AlpakaStream<T>::triad()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

alpaka::exec<Acc>(queue, workdiv, TriadKernel{}, std::data(d_a), std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}
@@ -157,7 +157,7 @@
void AlpakaStream<T>::nstream()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

alpaka::exec<Acc>(queue, workdiv, NstreamKernel{}, std::data(d_a), std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}
@@ -197,7 +197,6 @@ template<typename T>
auto AlpakaStream<T>::dot() -> T
{
auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, dotBlockSize * blockSize);
alpaka::exec<Acc>(queue, workdiv, DotKernel{}, std::data(d_a), std::data(d_b), std::data(d_sum), arraySize);
alpaka::wait(queue);

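
A side note on the babelstream hunks above: with the helper gone, the benchmark constructs its work division explicitly. The three constructor arguments of the WorkDiv alias (presumably an alpaka::WorkDivMembers) are, in order, blocks per grid, threads per block and elements per thread; a minimal sketch for the 1D case:

using WorkDiv = alpaka::WorkDivMembers<alpaka::DimInt<1>, std::size_t>;
// arraySize / blockSize blocks of blockSize threads, one element per thread.
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, std::size_t{1}};
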
17 changes: 16 additions & 1 deletion cmake/alpakaCommon.cmake
@@ -470,6 +470,10 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
message(STATUS "nvcc is used as CUDA compiler")

if(alpaka_CXX_STANDARD GREATER_EQUAL 20 AND CMAKE_VERSION VERSION_LESS "3.25.0")
message(FATAL_ERROR "CMake 3.24 and older does not support C++20 for nvcc")
endif()

# nvcc sets no linux/__linux macros on OpenPOWER linux
# nvidia bug id: 2448610
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
@@ -538,9 +542,20 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
endif()
endif()

# Link the CUDA Runtime library
if(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "Shared")
target_link_libraries(alpaka INTERFACE CUDA::cudart)
else()
target_link_libraries(alpaka INTERFACE CUDA::cudart_static)
endif()

if(NOT alpaka_DISABLE_VENDOR_RNG)
# Use cuRAND random number generators
target_link_libraries(alpaka INTERFACE CUDA::cudart CUDA::curand)
if(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "Shared")
target_link_libraries(alpaka INTERFACE CUDA::curand)
else()
target_link_libraries(alpaka INTERFACE CUDA::curand_static)
endif()
endif()
else()
message(FATAL_ERROR "Optional alpaka dependency CUDA could not be found!")
9 changes: 8 additions & 1 deletion docs/source/basic/cheatsheet.rst
@@ -176,15 +176,22 @@ Enqueue a memory copy from device to host
Kernel Execution
----------------
Prepare Kernel Bundle
.. code-block:: c++

HeatEquationKernel heatEqKernel;
// Arguments of KernelBundle: The kernel instance and the kernel arguments
auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

Automatically select a valid kernel launch configuration
.. code-block:: c++

Vec<Dim, Idx> const globalThreadExtent = vectorValue;
Vec<Dim, Idx> const elementsPerThread = vectorValue;

auto autoWorkDiv = getValidWorkDiv<Acc>(
auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
device,
bundeledKernel,
globalThreadExtent, elementsPerThread,
false,
GridBlockExtentSubDivRestrictions::Unrestricted);
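
Putting the cheatsheet pieces together, here is a minimal, self-contained sketch of the new flow: define a kernel, bundle it with its launch arguments, derive a kernel-aware work division, launch. The ScaleKernel, the buffer size and the serial CPU accelerator are illustrative assumptions, not code from this commit:

#include <alpaka/alpaka.hpp>

#include <cstddef>

// Illustrative kernel: scales each element of a buffer by a factor.
struct ScaleKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, float factor, std::size_t n) const
    {
        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        if(i < n)
            data[i] *= factor;
    }
};

auto main() -> int
{
    using Dim = alpaka::DimInt<1>;
    using Idx = std::size_t;
    using Acc = alpaka::AccCpuSerial<Dim, Idx>; // assumes the serial CPU back-end is enabled
    using Vec = alpaka::Vec<Dim, Idx>;

    auto const platformAcc = alpaka::Platform<Acc>{};
    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{devAcc};

    constexpr Idx n = 1024;
    auto buf = alpaka::allocBuf<float, Idx>(devAcc, Vec{n});
    alpaka::memset(queue, buf, 0); // give the buffer defined contents

    ScaleKernel scaleKernel;
    // Bundle the kernel with the exact arguments it will be launched with ...
    auto const& bundledKernel = alpaka::KernelBundle(scaleKernel, alpaka::getPtrNative(buf), 2.0f, n);
    // ... so the work division can respect the kernel's device-specific limits.
    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundledKernel, Vec{n}, Vec{Idx{1}});

    alpaka::exec<Acc>(queue, workDiv, scaleKernel, alpaka::getPtrNative(buf), 2.0f, n);
    alpaka::wait(queue);
    return 0;
}
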
1 change: 1 addition & 0 deletions example/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory("helloWorld/")
add_subdirectory("helloWorldLambda/")
add_subdirectory("kernelSpecialization/")
add_subdirectory("ls/")
add_subdirectory("matrixMulWithMdspan/")
add_subdirectory("monteCarloIntegration/")
add_subdirectory("openMPSchedule/")
add_subdirectory("parallelLoopPatterns/")
25 changes: 10 additions & 15 deletions example/bufferCopy/src/bufferCopy.cpp
@@ -106,19 +106,6 @@ auto example(TAccTag const&) -> int
using Vec = alpaka::Vec<Dim, Idx>;
Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
Vec const threadsPerGrid(Vec::all(static_cast<Idx>(10)));
using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
WorkDiv const devWorkDiv = alpaka::getValidWorkDiv<Acc>(
devAcc,
threadsPerGrid,
elementsPerThread,
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
WorkDiv const hostWorkDiv = alpaka::getValidWorkDiv<Host>(
devHost,
threadsPerGrid,
elementsPerThread,
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

// Create host and device buffers
//
@@ -177,10 +164,13 @@ auto example(TAccTag const&) -> int

FillBufferKernel fillBufferKernel;

auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const hostWorkDiv
= alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
hostViewPlainPtrMdSpan); // 1st kernel argument


// Copy host to device Buffer
//
// A copy operation of one buffer into
@@ -213,10 +203,15 @@ auto example(TAccTag const&) -> int
auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);

TestBufferKernel testBufferKernel;
auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const devWorkDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);


// Print device Buffer
//
// Because we really like to flood our
15 changes: 8 additions & 7 deletions example/complex/src/complex.cpp
@@ -55,15 +55,16 @@ auto example(TAccTag const&) -> int
// Define the work division
Idx const threadsPerGrid = 1u;
Idx const elementsPerThread = 1u;
auto const workDiv = alpaka::getValidWorkDiv<Acc>(
devAcc,
threadsPerGrid,
elementsPerThread,
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

ComplexKernel complexKernel;

auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, ComplexKernel{});
alpaka::exec<Acc>(queue, workDiv, complexKernel);
alpaka::wait(queue);

// Usage of alpaka::Complex<T> on the host side is the same as inside kernels, except math functions are not
19 changes: 11 additions & 8 deletions example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
@@ -129,14 +129,6 @@ auto example(TAccTag const&) -> int
//
auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent);

// Let alpaka calculate good block and grid sizes given our full problem extent.
alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<DevAcc>(
devAcc,
extent,
Vec::ones(),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));

// Prepare convolution filter at host
//
std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
Expand All @@ -155,6 +147,17 @@ auto example(TAccTag const&) -> int
// Construct kernel object
ConvolutionKernelMdspan2D convolutionKernel2D;

// Make a bundle
auto const& bundeledKernel = alpaka::KernelBundle(
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());


// Run the kernel, pass 3 arrays as 2D mdspans
alpaka::exec<DevAcc>(
queueAcc,