Skip to content

Commit

Permalink
Refactor memory allocation in the CUDA kernel launcher
Browse files Browse the repository at this point in the history
  • Loading branch information
Abdelrahman912 committed Dec 4, 2024
1 parent 52f1479 commit 05fb154
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 51 deletions.
58 changes: 9 additions & 49 deletions ext/GPU/CUDAKernelLauncher.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,17 @@ end


function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
return if CUDA.functional()
if CUDA.functional()
threads = convert(Ti, min(n_cells, 256))
shared_mem = _calculate_shared_memory(threads, n_basefuncs)
blocks = _calculate_nblocks(threads, n_cells)
_adapted_args = _adapt_args(args)

if (_can_use_dynshmem(shared_mem))
Ke = DynamicSharedMemFunction{3, Float32, Int32}((threads, n_basefuncs, n_basefuncs), Int32(0))
fe = DynamicSharedMemFunction{2, Float32, Int32}((threads, n_basefuncs), sizeof(Float32) * threads * n_basefuncs * n_basefuncs |> Int32)
mem_alloc = SharedMemAlloc(Ke, fe, shared_mem)
return CudaKernel(n_cells, n_basefuncs, kernel, _adapted_args, mem_alloc, threads, blocks)
else
Kes = CUDA.zeros(Float32, n_cells, n_basefuncs, n_basefuncs)
fes = CUDA.zeros(Float32, n_cells, n_basefuncs)
mem_alloc = GlobalMemAlloc(Kes, fes)
return CudaKernel(n_cells, n_basefuncs, kernel, _adapted_args, mem_alloc, threads, blocks)
end
adapted_args = _adapt_args(args)

mem_alloc = try_allocate_shared_mem(Float32, threads, n_basefuncs)
mem_alloc isa Nothing || return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)

# FIXME: mem_alloc adapted twice here and in launch!
mem_alloc = allocate_global_mem(Float32, n_cells, n_basefuncs) |> _adapt
return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
else
throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
end
Expand Down Expand Up @@ -72,40 +66,6 @@ function Ferrite.launch!(kernel::CudaKernel{GlobalMemAlloc{LOCAL_MATRICES, LOCAL
return nothing
end


"""
    _calculate_shared_memory(threads::Integer, n_basefuncs::Integer, ::Type{Tv} = Float32)

Calculate the dynamic shared memory (in bytes) required to hold, per block, the
local stiffness matrices (`threads × n_basefuncs × n_basefuncs`) and the local
force vectors (`threads × n_basefuncs`) with element type `Tv`.

# Arguments
- `threads::Integer`: Number of threads per block.
- `n_basefuncs::Integer`: Number of basis functions per cell.
- `Tv`: Element type of the buffers; defaults to `Float32` (previous hard-coded value).

# Returns
- `Integer`: Amount of shared memory in bytes, converted to the argument integer type.
"""
function _calculate_shared_memory(threads::Ti, n_basefuncs::Ti, ::Type{Tv} = Float32) where {Ti <: Integer, Tv <: Real}
    # Ke storage: threads * n_basefuncs^2 entries; fe storage: threads * n_basefuncs entries.
    return convert(Ti, sizeof(Tv) * threads * n_basefuncs * n_basefuncs + sizeof(Tv) * threads * n_basefuncs)
end

"""
    _can_use_dynshmem(required_shmem::Integer)

Check whether the current CUDA device can provide `required_shmem` bytes of
dynamic shared memory per block.

# Arguments
- `required_shmem::Integer`: Required shared memory size in bytes.

# Returns
- `Bool`: `true` if the device supports the requested amount; `false` otherwise.
"""
function _can_use_dynshmem(required_shmem::Integer)
    dev = device()
    max_dyn_shmem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    # `<=` (was `<`): requesting exactly the device maximum is still a valid launch.
    return required_shmem <= max_dyn_shmem
end

"""
_calculate_nblocks(threads::Integer, n_cells::Integer)
Expand Down
6 changes: 4 additions & 2 deletions ext/GPU/adapt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

Adapt.@adapt_structure GPUGrid
Adapt.@adapt_structure GPUDofHandler
Adapt.@adapt_structure GlobalMemAlloc

function _adapt_args(args)
return tuple(((_adapt(arg) for arg in args) |> collect)...)
Expand All @@ -13,6 +14,7 @@ function _adapt(kgpu::CUSPARSE.CuSparseMatrixCSC)
return Adapt.adapt_structure(CUSPARSE.CuSparseDeviceMatrixCSC, kgpu)
end


function _adapt(obj::Any)
# fallback to the default implementation
return Adapt.adapt_structure(CuArray, obj)
Expand All @@ -21,8 +23,8 @@ end
## Adapt GlobalMemAlloc
"""
    Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc)

Adapt a `GlobalMemAlloc` for device use by adapting its element-matrix and
element-vector buffers, returning a new `GlobalMemAlloc` of the adapted fields.
"""
function Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc)
    # NOTE(review): was `@show`, a debug print left in library code; `@debug`
    # keeps the trace available without emitting output by default.
    @debug "Adapting GlobalMemAlloc"
    kes = Adapt.adapt_structure(to, mem_alloc.Kes)
    fes = Adapt.adapt_structure(to, mem_alloc.fes)
    return GlobalMemAlloc(kes, fes)
end

Expand Down
24 changes: 24 additions & 0 deletions ext/GPU/cuda_mem_alloc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,34 @@ end

mem_size(alloc::SharedMemAlloc) = alloc.tot_mem_size


"""
    _can_use_dynshmem(required_shmem::Integer)

Check whether the current CUDA device can provide `required_shmem` bytes of
dynamic shared memory per block.

# Arguments
- `required_shmem::Integer`: Required shared memory size in bytes.

# Returns
- `Bool`: `true` if the device supports the requested amount; `false` otherwise.
"""
function _can_use_dynshmem(required_shmem::Integer)
    dev = device()
    max_dyn_shmem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    # `<=` (was `<`): requesting exactly the device maximum is still a valid launch.
    return required_shmem <= max_dyn_shmem
end


"""
    try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti)

Attempt to set up dynamic shared-memory buffers for the per-block local
stiffness matrices and force vectors. Returns a `SharedMemAlloc` when the
device can supply the required bytes, or `nothing` so the caller can fall
back to a global-memory allocation.
"""
function try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real}
    # Bytes for all Ke slices (block_dim × n_basefuncs × n_basefuncs) and fe rows.
    ke_bytes = sizeof(Tv) * block_dim * n_basefuncs * n_basefuncs
    fe_bytes = sizeof(Tv) * block_dim * n_basefuncs
    total_bytes = convert(Ti, ke_bytes + fe_bytes)
    _can_use_dynshmem(total_bytes) || return nothing
    # Ke starts at offset 0; fe starts right after all Ke storage.
    ke = DynamicSharedMemFunction{3, Tv, Ti}((block_dim, n_basefuncs, n_basefuncs), zero(Ti))
    fe = DynamicSharedMemFunction{2, Tv, Ti}((block_dim, n_basefuncs), convert(Ti, ke_bytes))
    return SharedMemAlloc(ke, fe, total_bytes)
end


"""
    GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc

Allocation strategy holding every cell's local stiffness matrix and local
force vector in GPU global memory (used when dynamic shared memory is not
available — presumably sized per cell; see `allocate_global_mem`).

# Fields
- `Kes`: all local matrices as one 3rd-order tensor (memory for all blocks).
- `fes`: all local vectors as one 2nd-order tensor (memory for all blocks).
"""
struct GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc
    Kes::LOCAL_MATRICES ## global level allocation (i.e. memory for all blocks -> 3rd order tensor)
    fes::LOCAL_VECTORS ## global level allocation (i.e. memory for all blocks -> 2nd order tensor)
end

"""
    cellke(alloc::GlobalMemAlloc, e::Integer)

Return a non-allocating view of the local stiffness matrix for cell `e`.
"""
function cellke(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer}
    return @view alloc.Kes[e, :, :]
end

"""
    cellfe(alloc::GlobalMemAlloc, e::Integer)

Return a non-allocating view of the local force vector for cell `e`.
"""
function cellfe(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer}
    return @view alloc.fes[e, :]
end


"""
    allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti)

Allocate zero-initialized device buffers in global memory for the local
stiffness matrices (`n_cells × n_basefuncs × n_basefuncs`) and local force
vectors (`n_cells × n_basefuncs`), wrapped in a `GlobalMemAlloc`.
"""
function allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real}
    element_matrices = CUDA.zeros(Tv, n_cells, n_basefuncs, n_basefuncs)
    element_vectors = CUDA.zeros(Tv, n_cells, n_basefuncs)
    return GlobalMemAlloc(element_matrices, element_vectors)
end

0 comments on commit 05fb154

Please sign in to comment.