From 82151d9c228e64ca2e021f27d12dc36d422c5a3c Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Tue, 23 Jul 2024 16:55:48 -0400
Subject: [PATCH] Use UniversalSize struct in more kernels

---
 .buildkite/pipeline.yml                 |  4 ++
 ext/ClimaCoreCUDAExt.jl                 |  1 +
 ext/cuda/data_layouts_copyto.jl         | 13 +++--
 ext/cuda/data_layouts_fill.jl           |  9 ++--
 ext/cuda/operators_finite_difference.jl | 28 ++++------
 src/DataLayouts/DataLayouts.jl          | 36 +++++++++++++
 test/DataLayouts/opt_universal_size.jl  | 68 +++++++++++++++++++++++++
 7 files changed, 130 insertions(+), 29 deletions(-)
 create mode 100644 test/DataLayouts/opt_universal_size.jl

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 5392dd1f42..c033ed5efb 100755
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -93,6 +93,10 @@ steps:
         key: data_opt_similar
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/opt_similar.jl"
 
+      - label: "Unit: opt_universal_size"
+        key: opt_universal_size
+        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/opt_universal_size.jl"
+
       - label: "Unit: data_ndims"
         key: unit_data_ndims
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_ndims.jl"
diff --git a/ext/ClimaCoreCUDAExt.jl b/ext/ClimaCoreCUDAExt.jl
index 28cc705328..167696e93d 100644
--- a/ext/ClimaCoreCUDAExt.jl
+++ b/ext/ClimaCoreCUDAExt.jl
@@ -16,6 +16,7 @@ import ClimaCore.Utilities: half
 import ClimaCore.Utilities: cart_ind, linear_ind
 import ClimaCore.RecursiveApply:
     ⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
+import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nh
 
 include(joinpath("cuda", "cuda_utils.jl"))
 include(joinpath("cuda", "data_layouts.jl"))
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
index 3e53d05c46..5439a61527 100644
--- a/ext/cuda/data_layouts_copyto.jl
+++ b/ext/cuda/data_layouts_copyto.jl
@@ -84,11 +84,11 @@ function Base.copyto!(
 end
 
 import ClimaCore.DataLayouts: isascalar
-function knl_copyto_flat!(dest::AbstractData, bc)
+function knl_copyto_flat!(dest::AbstractData, bc, us)
     @inbounds begin
-        n = size(dest)
         tidx = thread_index()
-        if valid_range(tidx, prod(n))
+        if tidx ≤ get_N(us)
+            n = size(dest)
             I = kernel_indexes(tidx, n)
             dest[I] = bc[I]
         end
@@ -98,14 +98,15 @@ end
 
 function cuda_copyto!(dest::AbstractData, bc)
     (_, _, Nv, Nh) = DataLayouts.universal_size(dest)
+    us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        auto_launch!(knl_copyto_flat!, (dest, bc), dest; auto = true)
+        auto_launch!(knl_copyto_flat!, (dest, bc, us), dest; auto = true)
     end
     return dest
 end
 
-# TODO: can we use CUDA's luanch configuration for all data layouts?
-# Currently, it seems to have a slight performance degredation.
+# TODO: can we use CUDA's launch configuration for all data layouts?
+# Currently, it seems to have a slight performance degradation.
 #! format: off
 # Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl
index f140250db2..9999c65a8a 100644
--- a/ext/cuda/data_layouts_fill.jl
+++ b/ext/cuda/data_layouts_fill.jl
@@ -1,8 +1,8 @@
-function knl_fill_flat!(dest::AbstractData, val)
+function knl_fill_flat!(dest::AbstractData, val, us)
     @inbounds begin
         tidx = thread_index()
-        n = size(dest)
-        if valid_range(tidx, prod(n))
+        if tidx ≤ get_N(us)
+            n = size(dest)
             I = kernel_indexes(tidx, n)
             @inbounds dest[I] = val
         end
@@ -12,8 +12,9 @@ end
 
 function cuda_fill!(dest::AbstractData, val)
     (_, _, Nv, Nh) = DataLayouts.universal_size(dest)
+    us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        auto_launch!(knl_fill_flat!, (dest, val), dest; auto = true)
+        auto_launch!(knl_fill_flat!, (dest, val, us), dest; auto = true)
     end
     return dest
 end
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
index 1b790f1e16..980e5a813a 100644
--- a/ext/cuda/operators_finite_difference.jl
+++ b/ext/cuda/operators_finite_difference.jl
@@ -29,17 +29,11 @@ function Base.copyto!(
     (li, lw, rw, ri) = bounds = Operators.window_bounds(space, bc)
     Nv = ri - li + 1
     max_threads = 256
+    us = DataLayouts.UniversalSize(Fields.field_values(out))
     nitems = Nv * Nq * Nq * Nh # # of independent items
     (nthreads, nblocks) = _configure_threadblock(max_threads, nitems)
-    args = (
-        strip_space(out, space),
-        strip_space(bc, space),
-        axes(out),
-        bounds,
-        Val(Nv),
-        Val(Nq),
-        Nh,
-    )
+    args =
+        (strip_space(out, space), strip_space(bc, space), axes(out), bounds, us)
     auto_launch!(
         copyto_stencil_kernel!,
         args,
@@ -49,20 +43,16 @@ function Base.copyto!(
     )
     return out
 end
+import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nh
 
-function copyto_stencil_kernel!(
-    out,
-    bc,
-    space,
-    bds,
-    ::Val{Nv},
-    ::Val{Nq},
-    Nh,
-) where {Nv, Nq}
+function copyto_stencil_kernel!(out, bc, space, bds, us)
     @inbounds begin
         gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-        if gid ≤ Nv * Nq * Nq * Nh
+        if gid ≤ get_N(us)
             (li, lw, rw, ri) = bds
+            Nv = get_Nv(us)
+            Nq = get_Nij(us)
+            Nh = get_Nh(us)
             (v, i, j, h) = cart_ind((Nv, Nq, Nq, Nh), gid).I
             hidx = (i, j, h)
             idx = v - 1 + li
diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl
index 16c50a29db..c8f0cb0b16 100644
--- a/src/DataLayouts/DataLayouts.jl
+++ b/src/DataLayouts/DataLayouts.jl
@@ -76,6 +76,42 @@ corresponding to `UniversalSize`.
 """
 @inline universal_size(::UniversalSize{Ni, Nj, Nv, Nh}) where {Ni, Nj, Nv, Nh} =
     (Ni, Nj, Nv, Nh)
+
+"""
+    get_N(::AbstractData)
+    get_N(::UniversalSize)
+
+Statically returns `prod((Ni, Nj, Nv, Nh))`
+"""
+@inline get_N(::UniversalSize{Ni, Nj, Nv, Nh}) where {Ni, Nj, Nv, Nh} =
+    prod((Ni, Nj, Nv, Nh))
+
+@inline get_N(data::AbstractData) = get_N(UniversalSize(data))
+
+"""
+    get_Nv(::UniversalSize)
+
+Statically returns `Nv`.
+"""
+get_Nv(::UniversalSize{Ni, Nj, Nv}) where {Ni, Nj, Nv} = Nv
+
+"""
+    get_Nij(::UniversalSize)
+
+Statically returns `Nij`.
+"""
+get_Nij(::UniversalSize{Nij}) where {Nij} = Nij
+
+"""
+    get_Nh(::AbstractData)
+    get_Nh(::UniversalSize)
+
+Statically returns `Nh`.
+"""
+get_Nh(::UniversalSize{Ni, Nj, Nv, Nh}) where {Ni, Nj, Nv, Nh} = Nh
+
+@inline get_Nh(data::AbstractData) = get_Nh(UniversalSize(data))
+
 @inline universal_size(data::AbstractData) = universal_size(UniversalSize(data))
 
 function Base.show(io::IO, data::AbstractData)
diff --git a/test/DataLayouts/opt_universal_size.jl b/test/DataLayouts/opt_universal_size.jl
new file mode 100644
index 0000000000..f54d5a6330
--- /dev/null
+++ b/test/DataLayouts/opt_universal_size.jl
@@ -0,0 +1,68 @@
+#=
+julia --project
+using Revise; include(joinpath("test", "DataLayouts", "opt_universal_size.jl"))
+=#
+using Test
+using ClimaCore.DataLayouts
+using ClimaCore: DataLayouts, Geometry
+import ClimaComms
+using StaticArrays: SMatrix
+ClimaComms.@import_required_backends
+using JET
+using InteractiveUtils: @code_typed
+
+function test_universal_size(data)
+    us = DataLayouts.UniversalSize(data)
+    # Make sure the result is statically returned / constant propagated
+    ct = @code_typed DataLayouts.get_N(us)
+    @test ct.first.code[1] isa Core.ReturnNode
+    @test ct.first.code[end].val == DataLayouts.get_N(us)
+
+    ct = @code_typed DataLayouts.get_Nv(us)
+    @test ct.first.code[1] isa Core.ReturnNode
+    @test ct.first.code[end].val == DataLayouts.get_Nv(us)
+
+    ct = @code_typed DataLayouts.get_Nij(us)
+    @test ct.first.code[1] isa Core.ReturnNode
+    @test ct.first.code[end].val == DataLayouts.get_Nij(us)
+
+    ct = @code_typed DataLayouts.get_Nh(us)
+    @test ct.first.code[1] isa Core.ReturnNode
+    @test ct.first.code[end].val == DataLayouts.get_Nh(us)
+
+    ct = @code_typed size(data)
+    @test ct.first.code[1] isa Core.ReturnNode
+    @test ct.first.code[end].val == size(data)
+
+    ct = @code_typed DataLayouts.get_N(data)
+    @test ct.first.code[1] isa Core.ReturnNode
+    @test ct.first.code[end].val == DataLayouts.get_N(data)
+
+    # Demo of failed constant prop:
+    ct = @code_typed prod(size(data))
+    @test ct.first.code[1] isa Expr # first element is not a return node, but an expression
+end
+
+@testset "UniversalSize" begin
+    device = ClimaComms.device()
+    device_zeros(args...) = ClimaComms.array_type(device)(zeros(args...))
+    FT = Float64
+    S = FT
+    Nf = 1
+    Nv = 4
+    Nij = 3
+    Nh = 5
+    Nk = 6
+#! format: off
+    data = DataF{S}(device_zeros(FT,Nf));                        test_universal_size(data)
+    data = IJFH{S, Nij, Nh}(device_zeros(FT,Nij,Nij,Nf,Nh));     test_universal_size(data)
+    data = IFH{S, Nij, Nh}(device_zeros(FT,Nij,Nf,Nh));          test_universal_size(data)
+    data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf));             test_universal_size(data)
+    data = IF{S, Nij}(device_zeros(FT,Nij,Nf));                  test_universal_size(data)
+    data = VF{S, Nv}(device_zeros(FT,Nv,Nf));                    test_universal_size(data)
+    data = VIJFH{S,Nv,Nij,Nh}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh));test_universal_size(data)
+    data = VIFH{S, Nv, Nij, Nh}(device_zeros(FT,Nv,Nij,Nf,Nh));  test_universal_size(data)
+#! format: on
+    # data = DataLayouts.IJKFVH{S, Nij, Nk, Nv, Nh}(device_zeros(FT,Nij,Nij,Nk,Nf,Nv,Nh)); test_universal_size(data) # TODO: test
+    # data = DataLayouts.IH1JH2{S, Nij}(device_zeros(FT,2*Nij,3*Nij)); test_universal_size(data) # TODO: test
+end