diff --git a/ext/GPU/CUDAKernelLauncher.jl b/ext/GPU/CUDAKernelLauncher.jl
index 82cba20523..02d5ad85e1 100644
--- a/ext/GPU/CUDAKernelLauncher.jl
+++ b/ext/GPU/CUDAKernelLauncher.jl
@@ -14,23 +14,17 @@ end
 
 function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
-    return if CUDA.functional()
+    if CUDA.functional()
         threads = convert(Ti, min(n_cells, 256))
-        shared_mem = _calculate_shared_memory(threads, n_basefuncs)
         blocks = _calculate_nblocks(threads, n_cells)
-        _adapted_args = _adapt_args(args)
-
-        if (_can_use_dynshmem(shared_mem))
-            Ke = DynamicSharedMemFunction{3, Float32, Int32}((threads, n_basefuncs, n_basefuncs), Int32(0))
-            fe = DynamicSharedMemFunction{2, Float32, Int32}((threads, n_basefuncs), sizeof(Float32) * threads * n_basefuncs * n_basefuncs |> Int32)
-            mem_alloc = SharedMemAlloc(Ke, fe, shared_mem)
-            return CudaKernel(n_cells, n_basefuncs, kernel, _adapted_args, mem_alloc, threads, blocks)
-        else
-            Kes = CUDA.zeros(Float32, n_cells, n_basefuncs, n_basefuncs)
-            fes = CUDA.zeros(Float32, n_cells, n_basefuncs)
-            mem_alloc = GlobalMemAlloc(Kes, fes)
-            return CudaKernel(n_cells, n_basefuncs, kernel, _adapted_args, mem_alloc, threads, blocks)
-        end
+        adapted_args = _adapt_args(args)
+
+        mem_alloc = try_allocate_shared_mem(Float32, threads, n_basefuncs)
+        mem_alloc isa Nothing || return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
+
+        # FIXME: mem_alloc adapted twice here and in launch!
+        mem_alloc = allocate_global_mem(Float32, n_cells, n_basefuncs) |> _adapt
+        return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
     else
         throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
     end
 end
@@ -72,40 +66,6 @@ function Ferrite.launch!(kernel::CudaKernel{GlobalMemAlloc{LOCAL_MATRICES, LOCAL
     return nothing
 end
 
-
-"""
-    _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
-
-Calculate the shared memory required for kernel execution.
-
-# Arguments
-- `threads::Integer`: Number of threads per block.
-- `n_basefuncs::Integer`: Number of basis functions per cell.
-
-# Returns
-- `Integer`: Amount of shared memory in bytes.
-"""
-function _calculate_shared_memory(threads::Ti, n_basefuncs::Ti) where {Ti <: Integer}
-    return convert(Ti, sizeof(Float32) * (threads) * (n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs)
-end
-
-"""
-    _can_use_dynshmem(required_shmem::Integer)
-
-Check if the GPU supports the required amount of dynamic shared memory.
-
-# Arguments
-- `required_shmem::Integer`: Required shared memory size in bytes.
-
-# Returns
-- `Bool`: `true` if the GPU can provide the required shared memory; `false` otherwise.
-"""
-function _can_use_dynshmem(required_shmem::Integer)
-    dev = device()
-    MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
-    return required_shmem < MAX_DYN_SHMEM
-end
-
 """
     _calculate_nblocks(threads::Integer, n_cells::Integer)
 
diff --git a/ext/GPU/adapt.jl b/ext/GPU/adapt.jl
index 3aee7c1258..79a3d9bba4 100644
--- a/ext/GPU/adapt.jl
+++ b/ext/GPU/adapt.jl
@@ -2,6 +2,7 @@
 
 Adapt.@adapt_structure GPUGrid
 Adapt.@adapt_structure GPUDofHandler
+Adapt.@adapt_structure GlobalMemAlloc
 
 function _adapt_args(args)
     return tuple(((_adapt(arg) for arg in args) |> collect)...)
@@ -13,6 +14,7 @@ function _adapt(kgpu::CUSPARSE.CuSparseMatrixCSC)
     return Adapt.adapt_structure(CUSPARSE.CuSparseDeviceMatrixCSC, kgpu)
 end
 
+
 function _adapt(obj::Any)
     # fallback to the default implementation
     return Adapt.adapt_structure(CuArray, obj)
@@ -21,8 +23,8 @@ end
 
 ## Adapt GlobalMemAlloc
 function Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc)
     @show "Adapting GlobalMemAlloc"
-    kes = Adapt.adapt_structure(to, mem_alloc.Kes |> cu)
-    fes = Adapt.adapt_structure(to, mem_alloc.fes |> cu)
+    kes = Adapt.adapt_structure(to, mem_alloc.Kes)
+    fes = Adapt.adapt_structure(to, mem_alloc.fes)
     return GlobalMemAlloc(kes, fes)
 end
diff --git a/ext/GPU/cuda_mem_alloc.jl b/ext/GPU/cuda_mem_alloc.jl
index facfb14d4a..dc30c2040f 100644
--- a/ext/GPU/cuda_mem_alloc.jl
+++ b/ext/GPU/cuda_mem_alloc.jl
@@ -20,6 +20,23 @@ end
 
 mem_size(alloc::SharedMemAlloc) = alloc.tot_mem_size
 
+
+function _can_use_dynshmem(required_shmem::Integer)
+    dev = device()
+    MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
+    return required_shmem < MAX_DYN_SHMEM
+end
+
+
+function try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real}
+    shared_mem = convert(Ti, sizeof(Tv) * (block_dim) * (n_basefuncs) * n_basefuncs + sizeof(Tv) * (block_dim) * n_basefuncs)
+    _can_use_dynshmem(shared_mem) || return nothing
+    Ke = DynamicSharedMemFunction{3, Tv, Ti}((block_dim, n_basefuncs, n_basefuncs), convert(Ti, 0))
+    fe = DynamicSharedMemFunction{2, Tv, Ti}((block_dim, n_basefuncs), convert(Ti, sizeof(Tv) * block_dim * n_basefuncs * n_basefuncs))
+    return SharedMemAlloc(Ke, fe, shared_mem)
+end
+
+
 struct GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc
     Kes::LOCAL_MATRICES ## global level allocation (i.e. memory for all blocks -> 3rd order tensor)
     fes::LOCAL_VECTORS  ## global level allocation (i.e. memory for all blocks -> 2nd order tensor)
@@ -27,3 +44,10 @@ end
 
 cellke(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer} = @view alloc.Kes[e, :, :]
 cellfe(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer} = @view alloc.fes[e, :]
+
+
+function allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real}
+    Kes = CUDA.zeros(Tv, n_cells, n_basefuncs, n_basefuncs)
+    fes = CUDA.zeros(Tv, n_cells, n_basefuncs)
+    return GlobalMemAlloc(Kes, fes)
+end
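
For reference, here is a hedged sketch (not part of the diff) of how the two helpers added in `ext/GPU/cuda_mem_alloc.jl` compose, mirroring the fallback logic in the refactored `Ferrite.init_kernel`. The problem sizes are made up for illustration, and both helpers are extension internals rather than public API.

```julia
using CUDA

# Illustrative sizes; in init_kernel these come from the grid and interpolation.
n_cells     = Int32(1024)
n_basefuncs = Int32(8)
threads     = Int32(min(n_cells, 256))   # block size, capped at 256 threads

# Prefer dynamic shared memory for the per-thread Ke/fe buffers; the helper
# returns `nothing` when the device cannot provide enough shared memory.
mem_alloc = try_allocate_shared_mem(Float32, threads, n_basefuncs)

if mem_alloc === nothing
    # Fall back to global-memory buffers (one Ke matrix and fe vector per cell),
    # adapted for device use (see the FIXME about the double adaption in launch!).
    mem_alloc = allocate_global_mem(Float32, n_cells, n_basefuncs) |> _adapt
end
```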