Skip to content

Commit

Permalink
Fix dispatching for some cuda kernels
Browse files Browse the repository at this point in the history
Fix

Update test filenames, ndims on DataLayouts - fix CI

Build latest version in docs

Fixes for cuda kernels with empty fields
  • Loading branch information
charleskawczynski committed Jun 10, 2024
1 parent a57c937 commit efb0865
Show file tree
Hide file tree
Showing 14 changed files with 320 additions and 165 deletions.
31 changes: 31 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ steps:
key: unit_data0d
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/data0d.jl"

- label: "Unit: data_fill"
key: unit_data_fill
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_fill.jl"

- label: "Unit: data1d"
key: unit_data1d
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/data1d.jl"
Expand All @@ -103,6 +107,16 @@ steps:
agents:
slurm_gpus: 1

- label: "Unit: data fill"
key: gpu_unit_data_fill
command:
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_fill.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1

- group: "Unit: Geometry"
steps:

Expand Down Expand Up @@ -1060,6 +1074,23 @@ steps:
key: "cpu_axis_tensor_conversion_perf_bm"
command: "julia --color=yes --project=.buildkite test/Geometry/axistensor_conversion_benchmarks.jl"

- group: "Perf: DataLayouts"
steps:

- label: "Perf: DataLayouts fill"
key: "cpu_datalayouts_fill"
command: "julia --color=yes --project=.buildkite test/DataLayouts/benchmark_fill.jl"

- label: "Perf: DataLayouts fill"
key: "gpu_datalayouts_fill"
command:
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
- "julia --color=yes --project=.buildkite test/DataLayouts/benchmark_fill.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1

- group: "Perf: Fields"
steps:

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
with:
version: '1.10'
- name: Install dependencies
run: julia --project=docs -e 'using Pkg; Pkg.instantiate(;verbose=true)'
run: julia --project=docs -e 'using Pkg; Pkg.develop(path="."); Pkg.instantiate(;verbose=true)'
- name: Build and deploy
env:
GKSwstype: "100" # headless GR: https://discourse.julialang.org/t/generation-of-documentation-fails-qt-qpa-xcb-could-not-connect-to-display/60988/2
Expand Down
3 changes: 3 additions & 0 deletions ext/ClimaCoreCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ import ClimaCore.Utilities: half
import ClimaCore.RecursiveApply:
, , , radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax

const CuArrayBackedTypes =
Union{CUDA.CuArray, SubArray{<:Any, <:Any, CUDA.CuArray}}

include(joinpath("cuda", "cuda_utils.jl"))
include(joinpath("cuda", "data_layouts.jl"))
include(joinpath("cuda", "fields.jl"))
Expand Down
17 changes: 10 additions & 7 deletions ext/cuda/cuda_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import ClimaCore.DataLayouts
import ClimaCore.DataLayouts: empty_kernel_stats

get_n_items(field::Fields.Field) =
prod(size(parent(Fields.field_values(field))))
get_n_items(data::DataLayouts.AbstractData) = prod(size(parent(data)))
prod(DataLayouts.universal_size(Fields.field_values(field)))
get_n_items(data::DataLayouts.AbstractData) =
prod(DataLayouts.universal_size(parent(data)))
get_n_items(arr::AbstractArray) = prod(size(parent(arr)))
get_n_items(tup::Tuple) = prod(tup)

Expand Down Expand Up @@ -47,11 +48,13 @@ function auto_launch!(
) where {F!}
if auto
nitems = get_n_items(data)
kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
config = CUDA.launch_configuration(kernel.fun)
threads = min(nitems, config.threads)
blocks = cld(nitems, threads)
kernel(args...; threads, blocks) # This knows to use always_inline from above.
if nitems 0
kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
config = CUDA.launch_configuration(kernel.fun)
threads = min(nitems, config.threads)
blocks = cld(nitems, threads)
kernel(args...; threads, blocks) # This knows to use always_inline from above.
end
else
kernel =
CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
Expand Down
100 changes: 10 additions & 90 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@ import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF
import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle
import ClimaCore.DataLayouts: promote_parent_array_type
import ClimaCore.DataLayouts: parent_array_type
import ClimaCore.DataLayouts: device_from_array_type, isascalar
import ClimaCore.DataLayouts: isascalar
import ClimaCore.DataLayouts: fused_copyto!
import Adapt
import CUDA

device_from_array_type(::Type{<:CUDA.CuArray}) = ClimaComms.CUDADevice()
device_from_array_type(::Type{<:SubArray{<:Any, <:Any, <:CUDA.CuArray}}) =
ClimaComms.CUDADevice()

parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} =
CUDA.CuArray{T, N, B} where {N}

Expand Down Expand Up @@ -44,24 +40,10 @@ function knl_copyto!(dest, src)
return nothing
end

# Fill kernel for (I, J, F, V, H)-shaped data layouts: thread (x, y) addresses
# the horizontal node (i, j), block x is the horizontal element h, and the
# vertical level v is reconstructed from the z block/thread coordinates.
function knl_fill!(dest, val)
    i = CUDA.threadIdx().x
    j = CUDA.threadIdx().y

    h = CUDA.blockIdx().x
    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z

    # Guard: the launch may spawn more z-threads than vertical levels (dim 4).
    if v <= size(dest, 4)
        # Field index is fixed at 1 — presumably the struct-valued assignment
        # `dest[I] = val` covers all fields of the layout; TODO confirm against
        # the AbstractData setindex! implementation.
        I = CartesianIndex((i, j, 1, v, h))
        @inbounds dest[I] = val
    end
    return nothing
end

function Base.copyto!(
dest::IJFH{S, Nij},
bc::Union{IJFH{S, Nij, A}, Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}}},
) where {S, Nij, A <: CUDA.CuArray}
) where {S, Nij, A <: CuArrayBackedTypes}
_, _, _, _, Nh = size(bc)
if Nh > 0
auto_launch!(
Expand All @@ -74,36 +56,14 @@ function Base.copyto!(
end
return dest
end
function Base.fill!(
    dest::IJFH{S, Nij, A},
    val,
) where {
    S,
    Nij,
    A <: Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}},
}
    # Fill a CUDA-backed IJFH layout with `val`: one block per horizontal
    # element, one thread per (i, j) nodal point. An empty horizontal extent
    # makes the fill a no-op (launching with 0 blocks would be invalid).
    Nh = size(dest)[5]
    Nh > 0 && auto_launch!(
        knl_fill!,
        (dest, val),
        dest;
        threads_s = (Nij, Nij),
        blocks_s = (Nh, 1),
    )
    return dest
end



function Base.copyto!(
dest::VIJFH{S, Nv, Nij},
bc::Union{
VIJFH{S, Nv, Nij, A},
Base.Broadcast.Broadcasted{VIJFHStyle{Nv, Nij, A}},
},
) where {S, Nv, Nij, A <: CUDA.CuArray}
) where {S, Nv, Nij, A <: CuArrayBackedTypes}
_, _, _, _, Nh = size(bc)
if Nv > 0 && Nh > 0
Nv_per_block = min(Nv, fld(256, Nij * Nij))
Expand All @@ -118,30 +78,11 @@ function Base.copyto!(
end
return dest
end
function Base.fill!(
    dest::VIJFH{S, Nv, Nij, A},
    val,
) where {S, Nv, Nij, A <: CUDA.CuArray}
    # Fill a CUDA-backed VIJFH layout with `val`. Vertical levels are packed
    # into the z thread dimension, capped so one block stays within 256
    # threads; remaining levels are distributed over the y block dimension.
    Nh = size(dest)[5]
    if Nh > 0 && Nv > 0
        levels_per_block = min(Nv, fld(256, Nij * Nij))
        level_blocks = cld(Nv, levels_per_block)
        auto_launch!(
            knl_fill!,
            (dest, val),
            dest;
            threads_s = (Nij, Nij, levels_per_block),
            blocks_s = (Nh, level_blocks),
        )
    end
    return dest
end


function Base.copyto!(
dest::VF{S, Nv},
bc::Union{VF{S, Nv, A}, Base.Broadcast.Broadcasted{VFStyle{Nv, A}}},
) where {S, Nv, A <: CUDA.CuArray}
) where {S, Nv, A <: CuArrayBackedTypes}
_, _, _, _, Nh = size(dest)
if Nv > 0 && Nh > 0
auto_launch!(
Expand All @@ -154,19 +95,6 @@ function Base.copyto!(
end
return dest
end
function Base.fill!(dest::VF{S, Nv, A}, val) where {S, Nv, A <: CUDA.CuArray}
    # Fill a CUDA-backed VF (single-column) layout with `val`: single-thread
    # blocks laid out over (horizontal, level). Skip the launch entirely when
    # either extent is empty.
    Nh = size(dest)[5]
    if !(Nv == 0 || Nh == 0)
        auto_launch!(
            knl_fill!,
            (dest, val),
            dest;
            threads_s = (1, 1),
            blocks_s = (Nh, Nv),
        )
    end
    return dest
end

function Base.copyto!(
dest::DataF{S},
Expand All @@ -181,16 +109,8 @@ function Base.copyto!(
)
return dest
end
function Base.fill!(dest::DataF{S, A}, val) where {S, A <: CUDA.CuArray}
    # A DataF holds the data of a single point, so a 1-thread, 1-block launch
    # suffices and no empty-size guard is needed.
    launch_args = (dest, val)
    auto_launch!(
        knl_fill!,
        launch_args,
        dest;
        threads_s = (1, 1),
        blocks_s = (1, 1),
    )
    return dest
end

include("fill.jl")

Base.@propagate_inbounds function rcopyto_at!(
pair::Pair{<:AbstractData, <:Any},
Expand Down Expand Up @@ -236,7 +156,7 @@ end

function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::VIJFH{S, Nv, Nij},
dest1::VIJFH{S, Nv, Nij, <:CuArrayBackedTypes},
::ClimaComms.CUDADevice,
) where {S, Nv, Nij}
_, _, _, _, Nh = size(dest1)
Expand All @@ -256,7 +176,7 @@ end

function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::IJFH{S, Nij},
dest1::IJFH{S, Nij, <:CuArrayBackedTypes},
::ClimaComms.CUDADevice,
) where {S, Nij}
_, _, _, _, Nh = size(dest1)
Expand All @@ -273,7 +193,7 @@ function fused_copyto!(
end
function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::VF{S, Nv},
dest1::VF{S, Nv, <:CuArrayBackedTypes},
::ClimaComms.CUDADevice,
) where {S, Nv}
_, _, _, _, Nh = size(dest1)
Expand All @@ -291,7 +211,7 @@ end

function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::DataF{S},
dest1::DataF{S, <:CuArrayBackedTypes},
::ClimaComms.CUDADevice,
) where {S}
auto_launch!(
Expand Down
30 changes: 30 additions & 0 deletions ext/cuda/fill.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Build the CartesianIndex used to address `dest` from a universal index
# tuple; the data-layout argument participates only in dispatch.
function cartesian_index(::AbstractData, inds)
    return CartesianIndex(inds)
end

# Flat fill kernel: each thread writes `val` into one entry of `dest`,
# addressed through the layout's universal index space.
function knl_fill_flat!(dest::AbstractData, val)
    # Extents of the destination in each universal dimension.
    n = DataLayouts.universal_size(dest)
    # Map this thread's global id to a universal index tuple
    # (kernel_indexes is defined in the CUDA utils of this extension).
    inds = kernel_indexes(n)
    # Guard: auto-launch may spawn more threads than there are items.
    if valid_range(inds, n)
        I = cartesian_index(dest, inds)
        @inbounds dest[I] = val
    end
    return nothing
end

function cuda_fill!(dest::AbstractData, val)
    # Launch the flat fill kernel only when the field, vertical, and
    # horizontal extents are all nonempty; otherwise filling is a no-op.
    us = DataLayouts.universal_size(dest)
    (Nf, Nv, Nh) = (us[3], us[4], us[5])
    if all(>(0), (Nf, Nv, Nh))
        auto_launch!(knl_fill_flat!, (dest, val), dest; auto = true)
    end
    return dest
end

#! format: off
# Route `Base.fill!` to the flat CUDA kernel for every data layout whose
# parent array is GPU-backed (a CuArray or a SubArray view of one).
Base.fill!(dest::IJFH{<:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::IFH{<:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::IJF{<:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::IF{<:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::VIFH{<:Any, <:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::VF{<:Any, <:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
Base.fill!(dest::DataF{<:Any, <:CuArrayBackedTypes}, val) = cuda_fill!(dest, val)
#! format: on
Loading

0 comments on commit efb0865

Please sign in to comment.