Skip to content

Commit

Permalink
Fix dispatching for some cuda kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Jun 9, 2024
1 parent a57c937 commit 525be4d
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 145 deletions.
31 changes: 31 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ steps:
key: unit_data0d
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/data0d.jl"

- label: "Unit: data_fill"
key: unit_data_fill
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/fill.jl"

- label: "Unit: data1d"
key: unit_data1d
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/data1d.jl"
Expand All @@ -103,6 +107,16 @@ steps:
agents:
slurm_gpus: 1

- label: "Unit: data fill"
key: unit_data_fill
command:
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/fill.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1

- group: "Unit: Geometry"
steps:

Expand Down Expand Up @@ -1060,6 +1074,23 @@ steps:
key: "cpu_axis_tensor_conversion_perf_bm"
command: "julia --color=yes --project=.buildkite test/Geometry/axistensor_conversion_benchmarks.jl"

- group: "Perf: DataLayouts"
steps:

- label: "Perf: DataLayouts fill"
key: "cpu_datalayouts_fill"
command: "julia --color=yes --project=.buildkite test/DataLayouts/benchmark_fill.jl"

- label: "Perf: DataLayouts fill"
key: "gpu_datalayouts_fill"
command:
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
- "julia --color=yes --project=.buildkite test/DataLayouts/benchmark_fill.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1

- group: "Perf: Fields"
steps:

Expand Down
2 changes: 2 additions & 0 deletions ext/ClimaCoreCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import ClimaCore.Utilities: half
import ClimaCore.RecursiveApply:
, , , radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax

# Array types backed by CUDA device memory: plain `CuArray`s and `SubArray`
# views of them. NOTE: the view case must be written `<:CUDA.CuArray` (any
# concrete `CuArray{T, N, B}` parent). The bare `CUDA.CuArray` UnionAll as a
# type parameter matches nothing, because a concrete `SubArray`'s parent
# parameter is always a fully-applied `CuArray{T, N, B}` — without `<:`,
# views of device arrays silently miss every `cu_array` method.
const cu_array = Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}}

include(joinpath("cuda", "cuda_utils.jl"))
include(joinpath("cuda", "data_layouts.jl"))
include(joinpath("cuda", "fields.jl"))
Expand Down
100 changes: 10 additions & 90 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@ import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF
import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle
import ClimaCore.DataLayouts: promote_parent_array_type
import ClimaCore.DataLayouts: parent_array_type
import ClimaCore.DataLayouts: device_from_array_type, isascalar
import ClimaCore.DataLayouts: isascalar
import ClimaCore.DataLayouts: fused_copyto!
import Adapt
import CUDA

# Map a CUDA-backed array type — a `CuArray`, or a `SubArray` view whose
# parent is a `CuArray` — to the ClimaComms device singleton that owns it.
device_from_array_type(::Type{<:CUDA.CuArray}) = ClimaComms.CUDADevice()
device_from_array_type(::Type{<:SubArray{<:Any, <:Any, <:CUDA.CuArray}}) =
    ClimaComms.CUDADevice()

# Strip the fixed dimensionality from a `CuArray` type: keep the element type
# `T` and memory-buffer parameter `B`, leaving the rank `N` free, so data
# layouts of different ranks can share one parent array type.
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} =
    CUDA.CuArray{T, N, B} where {N}

Expand Down Expand Up @@ -44,24 +40,10 @@ function knl_copyto!(dest, src)
return nothing
end

# CUDA kernel: write `val` into every entry of `dest`.
# Thread layout: one (i, j) thread per nodal point, block x is the element
# index h, and block y × thread z together index the vertical level v.
function knl_fill!(dest, val)
    i = CUDA.threadIdx().x
    j = CUDA.threadIdx().y
    h = CUDA.blockIdx().x
    v = CUDA.threadIdx().z + CUDA.blockDim().z * (CUDA.blockIdx().y - 1)
    # Guard the vertical index: the last y-block may be partially filled.
    if v <= size(dest, 4)
        @inbounds dest[CartesianIndex((i, j, 1, v, h))] = val
    end
    return nothing
end

function Base.copyto!(
dest::IJFH{S, Nij},
bc::Union{IJFH{S, Nij, A}, Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}}},
) where {S, Nij, A <: CUDA.CuArray}
) where {S, Nij, A <: cu_array}
_, _, _, _, Nh = size(bc)
if Nh > 0
auto_launch!(
Expand All @@ -74,36 +56,14 @@ function Base.copyto!(
end
return dest
end
# Fill an `IJFH` data layout on the GPU: one (i, j) thread per nodal point
# and one block per element. An empty layout (`Nh == 0`) is a no-op.
function Base.fill!(
    dest::IJFH{S, Nij, A},
    val,
) where {
    S,
    Nij,
    A <: Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}},
}
    (_, _, _, _, Nh) = size(dest)
    Nh > 0 && auto_launch!(
        knl_fill!,
        (dest, val),
        dest;
        threads_s = (Nij, Nij),
        blocks_s = (Nh, 1),
    )
    return dest
end



function Base.copyto!(
dest::VIJFH{S, Nv, Nij},
bc::Union{
VIJFH{S, Nv, Nij, A},
Base.Broadcast.Broadcasted{VIJFHStyle{Nv, Nij, A}},
},
) where {S, Nv, Nij, A <: CUDA.CuArray}
) where {S, Nv, Nij, A <: cu_array}
_, _, _, _, Nh = size(bc)
if Nv > 0 && Nh > 0
Nv_per_block = min(Nv, fld(256, Nij * Nij))
Expand All @@ -118,30 +78,11 @@ function Base.copyto!(
end
return dest
end
# Fill a `VIJFH` data layout on the GPU. The vertical dimension is split
# across blocks so each block stays within 256 threads.
#
# Dispatch fix: accept `SubArray` views of `CuArray`s in addition to plain
# `CuArray`s, matching the `IJFH` method above — with `A <: CUDA.CuArray`
# alone, `fill!` on a view-backed layout silently fell back to the generic
# (CPU) method.
function Base.fill!(
    dest::VIJFH{S, Nv, Nij, A},
    val,
) where {
    S,
    Nv,
    Nij,
    A <: Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}},
}
    _, _, _, _, Nh = size(dest)
    if Nv > 0 && Nh > 0
        # Cap threads per block at 256: Nij * Nij * Nv_per_block <= 256.
        Nv_per_block = min(Nv, fld(256, Nij * Nij))
        Nv_blocks = cld(Nv, Nv_per_block)
        auto_launch!(
            knl_fill!,
            (dest, val),
            dest;
            threads_s = (Nij, Nij, Nv_per_block),
            blocks_s = (Nh, Nv_blocks),
        )
    end
    return dest
end


function Base.copyto!(
dest::VF{S, Nv},
bc::Union{VF{S, Nv, A}, Base.Broadcast.Broadcasted{VFStyle{Nv, A}}},
) where {S, Nv, A <: CUDA.CuArray}
) where {S, Nv, A <: cu_array}
_, _, _, _, Nh = size(dest)
if Nv > 0 && Nh > 0
auto_launch!(
Expand All @@ -154,19 +95,6 @@ function Base.copyto!(
end
return dest
end
# Fill a `VF` (column) data layout on the GPU: one block per (element,
# level) pair, a single thread per block.
#
# Dispatch fix: accept `SubArray` views of `CuArray`s in addition to plain
# `CuArray`s, matching the `IJFH` method — with `A <: CUDA.CuArray` alone,
# `fill!` on a view-backed layout fell back to the generic (CPU) method.
function Base.fill!(
    dest::VF{S, Nv, A},
    val,
) where {
    S,
    Nv,
    A <: Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}},
}
    _, _, _, _, Nh = size(dest)
    if Nv > 0 && Nh > 0
        auto_launch!(
            knl_fill!,
            (dest, val),
            dest;
            threads_s = (1, 1),
            blocks_s = (Nh, Nv),
        )
    end
    return dest
end

function Base.copyto!(
dest::DataF{S},
Expand All @@ -181,16 +109,8 @@ function Base.copyto!(
)
return dest
end
# Fill a 0-dimensional `DataF` layout on the GPU with a single-thread launch.
#
# Dispatch fix: accept `SubArray` views of `CuArray`s in addition to plain
# `CuArray`s, matching the `IJFH` method — with `A <: CUDA.CuArray` alone,
# `fill!` on a view-backed layout fell back to the generic (CPU) method.
function Base.fill!(
    dest::DataF{S, A},
    val,
) where {S, A <: Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}}}
    auto_launch!(
        knl_fill!,
        (dest, val),
        dest;
        threads_s = (1, 1),
        blocks_s = (1, 1),
    )
    return dest
end

include("fill.jl")

Base.@propagate_inbounds function rcopyto_at!(
pair::Pair{<:AbstractData, <:Any},
Expand Down Expand Up @@ -236,7 +156,7 @@ end

function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::VIJFH{S, Nv, Nij},
dest1::VIJFH{S, Nv, Nij, <:cu_array},
::ClimaComms.CUDADevice,
) where {S, Nv, Nij}
_, _, _, _, Nh = size(dest1)
Expand All @@ -256,7 +176,7 @@ end

function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::IJFH{S, Nij},
dest1::IJFH{S, Nij, <:cu_array},
::ClimaComms.CUDADevice,
) where {S, Nij}
_, _, _, _, Nh = size(dest1)
Expand All @@ -273,7 +193,7 @@ function fused_copyto!(
end
function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::VF{S, Nv},
dest1::VF{S, Nv, <:cu_array},
::ClimaComms.CUDADevice,
) where {S, Nv}
_, _, _, _, Nh = size(dest1)
Expand All @@ -291,7 +211,7 @@ end

function fused_copyto!(
fmbc::FusedMultiBroadcast,
dest1::DataF{S},
dest1::DataF{S, <:cu_array},
::ClimaComms.CUDADevice,
) where {S}
auto_launch!(
Expand Down
30 changes: 30 additions & 0 deletions ext/cuda/fill.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Convert a tuple of per-dimension indexes into a `CartesianIndex`.
# The `AbstractData` argument is unused here and exists only for dispatch —
# presumably so specific layouts can specialize how kernel indexes map to
# their storage order; confirm against other `cartesian_index` methods.
cartesian_index(::AbstractData, inds) = CartesianIndex(inds)

# CUDA kernel: write `val` into the single entry of `dest` addressed by this
# thread. `kernel_indexes` / `valid_range` are project helpers — presumably
# they map the launch grid onto the layout's index space and guard against
# out-of-range threads; confirm in cuda_utils.
function knl_fill_flat!(dest::AbstractData, val)
    dims = size(dest)
    idx = kernel_indexes(dims)
    if valid_range(idx, dims)
        @inbounds dest[cartesian_index(dest, idx)] = val
    end
    return nothing
end

# Launch the flat fill kernel over `dest`, letting `auto_launch!` pick the
# launch configuration (`auto = true`). An empty layout is a no-op.
function cuda_fill!(dest::AbstractData, val)
    (_, _, _, Nv, Nh) = size(dest)
    if Nv > 0 && Nh > 0
        auto_launch!(knl_fill_flat!, (dest, val), dest; auto = true)
    end
    return dest
end

# Route `Base.fill!` to `cuda_fill!` for every DataLayouts type whose parent
# array is device-resident (`cu_array`). The `<:cu_array` parameter position
# differs per layout because each type places its array parameter last.
#! format: off
Base.fill!(dest::IJFH{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::IFH{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::IJF{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::IF{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::VIFH{<:Any, <:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::VF{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val)
Base.fill!(dest::DataF{<:Any, <:cu_array}, val) = cuda_fill!(dest, val)
#! format: on
Loading

0 comments on commit 525be4d

Please sign in to comment.