Skip to content

Commit

Permalink
Refactor memory allocation in the CUDA kernel launcher
Browse files Browse the repository at this point in the history
  • Loading branch information
Abdelrahman912 committed Dec 4, 2024
1 parent 52f1479 commit 05fb154
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 51 deletions.
58 changes: 9 additions & 49 deletions ext/GPU/CUDAKernelLauncher.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,17 @@ end


function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
return if CUDA.functional()
if CUDA.functional()
threads = convert(Ti, min(n_cells, 256))
shared_mem = _calculate_shared_memory(threads, n_basefuncs)
blocks = _calculate_nblocks(threads, n_cells)
_adapted_args = _adapt_args(args)

if (_can_use_dynshmem(shared_mem))
Ke = DynamicSharedMemFunction{3, Float32, Int32}((threads, n_basefuncs, n_basefuncs), Int32(0))
fe = DynamicSharedMemFunction{2, Float32, Int32}((threads, n_basefuncs), sizeof(Float32) * threads * n_basefuncs * n_basefuncs |> Int32)
mem_alloc = SharedMemAlloc(Ke, fe, shared_mem)
return CudaKernel(n_cells, n_basefuncs, kernel, _adapted_args, mem_alloc, threads, blocks)
else
Kes = CUDA.zeros(Float32, n_cells, n_basefuncs, n_basefuncs)
fes = CUDA.zeros(Float32, n_cells, n_basefuncs)
mem_alloc = GlobalMemAlloc(Kes, fes)
return CudaKernel(n_cells, n_basefuncs, kernel, _adapted_args, mem_alloc, threads, blocks)
end
adapted_args = _adapt_args(args)

mem_alloc = try_allocate_shared_mem(Float32, threads, n_basefuncs)
mem_alloc isa Nothing || return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)

# FIXME: mem_alloc adapted twice here and in launch!
mem_alloc = allocate_global_mem(Float32, n_cells, n_basefuncs) |> _adapt
return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
else
throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
end
Expand Down Expand Up @@ -72,40 +66,6 @@ function Ferrite.launch!(kernel::CudaKernel{GlobalMemAlloc{LOCAL_MATRICES, LOCAL
return nothing
end


"""
    _calculate_shared_memory(threads::Integer, n_basefuncs::Integer, ::Type{Tv} = Float32)

Calculate the dynamic shared memory (in bytes) required to hold, per block, the
local stiffness matrices (`threads × n_basefuncs × n_basefuncs`) and the local
force vectors (`threads × n_basefuncs`) with element type `Tv`.

# Arguments
- `threads::Integer`: Number of threads per block.
- `n_basefuncs::Integer`: Number of basis functions per cell.
- `Tv`: Element type of the buffers; defaults to `Float32` (previous hard-coded value).

# Returns
- `Integer`: Amount of shared memory in bytes, converted to the argument integer type.
"""
function _calculate_shared_memory(threads::Ti, n_basefuncs::Ti, ::Type{Tv} = Float32) where {Ti <: Integer, Tv <: Real}
    # Ke storage: threads * n_basefuncs^2 entries; fe storage: threads * n_basefuncs entries.
    return convert(Ti, sizeof(Tv) * threads * n_basefuncs * n_basefuncs + sizeof(Tv) * threads * n_basefuncs)
end

"""
    _can_use_dynshmem(required_shmem::Integer)

Check whether the current CUDA device can provide `required_shmem` bytes of
dynamic shared memory per block.

# Arguments
- `required_shmem::Integer`: Required shared memory size in bytes.

# Returns
- `Bool`: `true` if the device supports the requested amount; `false` otherwise.
"""
function _can_use_dynshmem(required_shmem::Integer)
    dev = device()
    max_dyn_shmem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    # `<=` (was `<`): requesting exactly the device maximum is still a valid launch.
    return required_shmem <= max_dyn_shmem
end

"""
_calculate_nblocks(threads::Integer, n_cells::Integer)
Expand Down
6 changes: 4 additions & 2 deletions ext/GPU/adapt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

Adapt.@adapt_structure GPUGrid
Adapt.@adapt_structure GPUDofHandler
Adapt.@adapt_structure GlobalMemAlloc

function _adapt_args(args)
return tuple(((_adapt(arg) for arg in args) |> collect)...)
Expand All @@ -13,6 +14,7 @@ function _adapt(kgpu::CUSPARSE.CuSparseMatrixCSC)
return Adapt.adapt_structure(CUSPARSE.CuSparseDeviceMatrixCSC, kgpu)
end


function _adapt(obj::Any)
# fallback to the default implementation
return Adapt.adapt_structure(CuArray, obj)
Expand All @@ -21,8 +23,8 @@ end
## Adapt GlobalMemAlloc
"""
    Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc)

Adapt a `GlobalMemAlloc` for device use by adapting its element-matrix and
element-vector buffers, returning a new `GlobalMemAlloc` of the adapted fields.
"""
function Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc)
    # NOTE(review): was `@show`, a debug print left in library code; `@debug`
    # keeps the trace available without emitting output by default.
    @debug "Adapting GlobalMemAlloc"
    kes = Adapt.adapt_structure(to, mem_alloc.Kes)
    fes = Adapt.adapt_structure(to, mem_alloc.fes)
    return GlobalMemAlloc(kes, fes)
end

Expand Down
24 changes: 24 additions & 0 deletions ext/GPU/cuda_mem_alloc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,34 @@ end

mem_size(alloc::SharedMemAlloc) = alloc.tot_mem_size


"""
    _can_use_dynshmem(required_shmem::Integer)

Check whether the current CUDA device can provide `required_shmem` bytes of
dynamic shared memory per block.

# Arguments
- `required_shmem::Integer`: Required shared memory size in bytes.

# Returns
- `Bool`: `true` if the device supports the requested amount; `false` otherwise.
"""
function _can_use_dynshmem(required_shmem::Integer)
    dev = device()
    max_dyn_shmem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    # `<=` (was `<`): requesting exactly the device maximum is still a valid launch.
    return required_shmem <= max_dyn_shmem
end


"""
    try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti)

Attempt to set up dynamic shared-memory buffers for the per-block local
stiffness matrices and force vectors. Returns a `SharedMemAlloc` when the
device can supply the required bytes, or `nothing` so the caller can fall
back to a global-memory allocation.
"""
function try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real}
    # Bytes for all Ke slices (block_dim × n_basefuncs × n_basefuncs) and fe rows.
    ke_bytes = sizeof(Tv) * block_dim * n_basefuncs * n_basefuncs
    fe_bytes = sizeof(Tv) * block_dim * n_basefuncs
    total_bytes = convert(Ti, ke_bytes + fe_bytes)
    _can_use_dynshmem(total_bytes) || return nothing
    # Ke starts at offset 0; fe starts right after all Ke storage.
    ke = DynamicSharedMemFunction{3, Tv, Ti}((block_dim, n_basefuncs, n_basefuncs), zero(Ti))
    fe = DynamicSharedMemFunction{2, Tv, Ti}((block_dim, n_basefuncs), convert(Ti, ke_bytes))
    return SharedMemAlloc(ke, fe, total_bytes)
end


"""
    GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc

Allocation strategy holding every cell's local stiffness matrix and local
force vector in GPU global memory (used when dynamic shared memory is not
available — presumably sized per cell; see `allocate_global_mem`).

# Fields
- `Kes`: all local matrices as one 3rd-order tensor (memory for all blocks).
- `fes`: all local vectors as one 2nd-order tensor (memory for all blocks).
"""
struct GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc
    Kes::LOCAL_MATRICES ## global level allocation (i.e. memory for all blocks -> 3rd order tensor)
    fes::LOCAL_VECTORS ## global level allocation (i.e. memory for all blocks -> 2nd order tensor)
end

"""
    cellke(alloc::GlobalMemAlloc, e::Integer)

Return a non-allocating view of the local stiffness matrix for cell `e`.
"""
function cellke(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer}
    return @view alloc.Kes[e, :, :]
end

"""
    cellfe(alloc::GlobalMemAlloc, e::Integer)

Return a non-allocating view of the local force vector for cell `e`.
"""
function cellfe(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer}
    return @view alloc.fes[e, :]
end


"""
    allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti)

Allocate zero-initialized device buffers in global memory for the local
stiffness matrices (`n_cells × n_basefuncs × n_basefuncs`) and local force
vectors (`n_cells × n_basefuncs`), wrapped in a `GlobalMemAlloc`.
"""
function allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real}
    element_matrices = CUDA.zeros(Tv, n_cells, n_basefuncs, n_basefuncs)
    element_vectors = CUDA.zeros(Tv, n_cells, n_basefuncs)
    return GlobalMemAlloc(element_matrices, element_vectors)
end

0 comments on commit 05fb154

Please sign in to comment.