diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 1a6e7756c5..48043f692b 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -77,6 +77,10 @@ steps: key: unit_data0d command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/data0d.jl" + - label: "Unit: data_fill" + key: unit_data_fill + command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/fill.jl" + - label: "Unit: data1d" key: unit_data1d command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/data1d.jl" @@ -103,6 +107,16 @@ steps: agents: slurm_gpus: 1 + - label: "Unit: data fill" + key: unit_data_fill + command: + - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" + - "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/fill.jl" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - group: "Unit: Geometry" steps: @@ -1060,6 +1074,23 @@ steps: key: "cpu_axis_tensor_conversion_perf_bm" command: "julia --color=yes --project=.buildkite test/Geometry/axistensor_conversion_benchmarks.jl" + - group: "Perf: DataLayouts" + steps: + + - label: "Perf: DataLayouts fill" + key: "cpu_datalayouts_fill" + command: "julia --color=yes --project=.buildkite test/DataLayouts/benchmark_fill.jl" + + - label: "Perf: DataLayouts fill" + key: "gpu_datalayouts_fill" + command: + - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" + - "julia --color=yes --project=.buildkite test/DataLayouts/benchmark_fill.jl" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - group: "Perf: Fields" steps: diff --git a/ext/ClimaCoreCUDAExt.jl b/ext/ClimaCoreCUDAExt.jl index 8f1e81bc8c..8126e65813 100644 --- a/ext/ClimaCoreCUDAExt.jl +++ b/ext/ClimaCoreCUDAExt.jl @@ -14,6 +14,8 @@ import ClimaCore.Utilities: half import ClimaCore.RecursiveApply: ⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax +const cu_array = Union{CUDA.CuArray, SubArray{<:Any, <:Any, CUDA.CuArray}} + include(joinpath("cuda", "cuda_utils.jl")) include(joinpath("cuda", "data_layouts.jl")) include(joinpath("cuda", "fields.jl")) diff --git a/ext/cuda/data_layouts.jl b/ext/cuda/data_layouts.jl index 2f8860a136..648ac6f5e8 100644 --- a/ext/cuda/data_layouts.jl +++ b/ext/cuda/data_layouts.jl @@ -5,15 +5,11 @@ import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle import ClimaCore.DataLayouts: promote_parent_array_type import ClimaCore.DataLayouts: parent_array_type -import ClimaCore.DataLayouts: device_from_array_type, isascalar +import ClimaCore.DataLayouts: isascalar import ClimaCore.DataLayouts: fused_copyto! import Adapt import CUDA -device_from_array_type(::Type{<:CUDA.CuArray}) = ClimaComms.CUDADevice() -device_from_array_type(::Type{<:SubArray{<:Any, <:Any, <:CUDA.CuArray}}) = - ClimaComms.CUDADevice() - parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} = CUDA.CuArray{T, N, B} where {N} @@ -44,24 +40,10 @@ function knl_copyto!(dest, src) return nothing end -function knl_fill!(dest, val) - i = CUDA.threadIdx().x - j = CUDA.threadIdx().y - - h = CUDA.blockIdx().x - v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z - - if v <= size(dest, 4) - I = CartesianIndex((i, j, 1, v, h)) - @inbounds dest[I] = val - end - return nothing -end - function Base.copyto!( dest::IJFH{S, Nij}, bc::Union{IJFH{S, Nij, A}, Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}}}, -) where {S, Nij, A <: CUDA.CuArray} +) where {S, Nij, A <: cu_array} _, _, _, _, Nh = size(bc) if Nh > 0 auto_launch!( @@ -74,28 +56,6 @@ function Base.copyto!( end return dest end -function Base.fill!( - dest::IJFH{S, Nij, A}, - val, -) where { - S, - Nij, - A <: Union{CUDA.CuArray, SubArray{<:Any, <:Any, <:CUDA.CuArray}}, -} - _, _, _, _, Nh = size(dest) - if Nh > 0 - auto_launch!( - knl_fill!, - (dest, val), - dest; - threads_s = (Nij, Nij), - blocks_s = (Nh, 1), - ) - end - return dest -end - - function Base.copyto!( dest::VIJFH{S, Nv, Nij}, @@ -103,7 +63,7 @@ function Base.copyto!( VIJFH{S, Nv, Nij, A}, Base.Broadcast.Broadcasted{VIJFHStyle{Nv, Nij, A}}, }, -) where {S, Nv, Nij, A <: CUDA.CuArray} +) where {S, Nv, Nij, A <: cu_array} _, _, _, _, Nh = size(bc) if Nv > 0 && Nh > 0 Nv_per_block = min(Nv, fld(256, Nij * Nij)) @@ -118,30 +78,11 @@ function Base.copyto!( end return dest end -function Base.fill!( - dest::VIJFH{S, Nv, Nij, A}, - val, -) where {S, Nv, Nij, A <: CUDA.CuArray} - _, _, _, _, Nh = size(dest) - if Nv > 0 && Nh > 0 - Nv_per_block = min(Nv, fld(256, Nij * Nij)) - Nv_blocks = cld(Nv, Nv_per_block) - auto_launch!( - knl_fill!, - (dest, val), - dest; - threads_s = (Nij, Nij, Nv_per_block), - blocks_s = (Nh, Nv_blocks), - ) - end - return dest -end - function Base.copyto!( dest::VF{S, Nv}, bc::Union{VF{S, Nv, A}, Base.Broadcast.Broadcasted{VFStyle{Nv, A}}}, -) where {S, Nv, A <: CUDA.CuArray} +) where {S, Nv, A <: cu_array} _, _, _, _, Nh = size(dest) if Nv > 0 && Nh > 0 auto_launch!( @@ -154,19 +95,6 @@ function Base.copyto!( end return dest end -function Base.fill!(dest::VF{S, Nv, A}, val) where {S, Nv, A <: CUDA.CuArray} - _, _, _, _, Nh = size(dest) - if Nv > 0 && Nh > 0 - auto_launch!( - knl_fill!, - (dest, val), - dest; - threads_s = (1, 1), - blocks_s = (Nh, Nv), - ) - end - return dest -end function Base.copyto!( dest::DataF{S}, @@ -181,16 +109,8 @@ function Base.copyto!( ) return dest end -function Base.fill!(dest::DataF{S, A}, val) where {S, A <: CUDA.CuArray} - auto_launch!( - knl_fill!, - (dest, val), - dest; - threads_s = (1, 1), - blocks_s = (1, 1), - ) - return dest -end + +include("fill.jl") Base.@propagate_inbounds function rcopyto_at!( pair::Pair{<:AbstractData, <:Any}, @@ -236,7 +156,7 @@ end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::VIJFH{S, Nv, Nij}, + dest1::VIJFH{S, Nv, Nij, <:cu_array}, ::ClimaComms.CUDADevice, ) where {S, Nv, Nij} _, _, _, _, Nh = size(dest1) @@ -256,7 +176,7 @@ end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::IJFH{S, Nij}, + dest1::IJFH{S, Nij, <:cu_array}, ::ClimaComms.CUDADevice, ) where {S, Nij} _, _, _, _, Nh = size(dest1) @@ -273,7 +193,7 @@ function fused_copyto!( end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::VF{S, Nv}, + dest1::VF{S, Nv, <:cu_array}, ::ClimaComms.CUDADevice, ) where {S, Nv} _, _, _, _, Nh = size(dest1) @@ -291,7 +211,7 @@ end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::DataF{S}, + dest1::DataF{S, <:cu_array}, ::ClimaComms.CUDADevice, ) where {S} auto_launch!( diff --git a/ext/cuda/fill.jl b/ext/cuda/fill.jl new file mode 100644 index 0000000000..235de93eea --- /dev/null +++ b/ext/cuda/fill.jl @@ -0,0 +1,30 @@ +cartesian_index(::AbstractData, inds) = CartesianIndex(inds) + +function knl_fill_flat!(dest::AbstractData, val) + n = size(dest) + inds = kernel_indexes(n) + if valid_range(inds, n) + I = cartesian_index(dest, inds) + @inbounds dest[I] = val + end + return nothing +end + +function cuda_fill!(dest::AbstractData, val) + (Nij, Nij, Nf, Nv, Nh) = size(dest) + if Nv > 0 && Nh > 0 + auto_launch!(knl_fill_flat!, (dest, val), dest; auto = true) + end + return dest +end + +#! format: off +Base.fill!(dest::IJFH{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::IFH{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::IJF{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::IF{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::VIFH{<:Any, <:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::VF{<:Any, <:Any, <:cu_array}, val) = cuda_fill!(dest, val) +Base.fill!(dest::DataF{<:Any, <:cu_array}, val) = cuda_fill!(dest, val) +#! format: on diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl index b71c3e0ae8..f1c46bf865 100644 --- a/src/DataLayouts/DataLayouts.jl +++ b/src/DataLayouts/DataLayouts.jl @@ -270,7 +270,8 @@ Base.copy(data::IJFH{S, Nij}) where {S, Nij} = IJFH{S, Nij}(copy(parent(data))) function Base.size(data::IJFH{S, Nij}) where {S, Nij} Nv = 1 Nh = size(parent(data), 4) - (Nij, Nij, 1, Nv, Nh) + Nf = ncomponents(data) + (Nij, Nij, Nf, Nv, Nh) end function Base.fill!(data::IJFH, val) @@ -444,7 +445,8 @@ Base.copy(data::IFH{S, Ni}) where {S, Ni} = IFH{S, Ni}(copy(parent(data))) function Base.size(data::IFH{S, Ni}) where {S, Ni} Nv = 1 Nh = size(parent(data), 3) - (Ni, 1, 1, Nv, Nh) + Nf = ncomponents(data) + (Ni, 1, Nf, Nv, Nh) end function Base.fill!(data::IFH, val) @@ -507,7 +509,7 @@ end @inbounds set_struct!( parent(data), convert(S, val), - Val(3), + Val(2), CartesianIndex(i, 1, h), ) end @@ -520,7 +522,7 @@ end @inbounds set_struct!( parent(data), convert(S, val), - Val(3), + Val(2), CartesianIndex(i, 1, h), ) end @@ -543,10 +545,11 @@ end function DataF{S}(array::AbstractVector{T}) where {S, T} check_basetype(T, S) + @assert size(array, 1) == typesize(T, S) DataF{S, typeof(array)}(array) end -function DataF{S}(ArrayType) where {S} +function DataF{S}(::Type{ArrayType}) where {S, ArrayType} T = eltype(ArrayType) DataF{S}(ArrayType(undef, typesize(T, S))) end @@ -638,8 +641,9 @@ end @inbounds slab[I[1], I[2]] = val end -Base.size(::DataSlab2D{S, Nij}) where {S, Nij} = (Nij, Nij, 1, 1, 1) -Base.axes(::DataSlab2D{S, Nij}) where {S, Nij} = (SOneTo(Nij), SOneTo(Nij)) +Base.size(data::DataSlab2D{S, Nij}) where {S, Nij} = + (Nij, Nij, ncomponents(data), 1, 1) +Base.axes(data::DataSlab2D{S, Nij}) where {S, Nij} = (SOneTo(Nij), SOneTo(Nij)) @inline function slab(data::DataSlab2D, h) @boundscheck (h >= 1) || throw(BoundsError(data, (h,))) @@ -690,7 +694,8 @@ function replace_basetype(data::IJF{S, Nij}, ::Type{T}) where {S, Nij, T} end function Base.size(data::IJF{S, Nij}) where {S, Nij} - return (Nij, Nij, 1, 1, 1) + Nf = ncomponents(data) + return (Nij, Nij, Nf, 1, 1) end function Base.fill!(data::IJF{S, Nij}, val) where {S, Nij} @inbounds for j in 1:Nij, i in 1:Nij @@ -775,8 +780,9 @@ end @inbounds slab[I[1]] = val end -function Base.size(::DataSlab1D{<:Any, Ni}) where {Ni} - return (Ni, 1, 1, 1, 1) +function Base.size(data::DataSlab1D{<:Any, Ni}) where {Ni} + Nf = ncomponents(data) + return (Ni, 1, Nf, 1, 1) end Base.axes(::DataSlab1D{S, Ni}) where {S, Ni} = (SOneTo(Ni),) Base.lastindex(::DataSlab1D{S, Ni}) where {S, Ni} = Ni @@ -888,7 +894,7 @@ end # ====================== Base.length(data::DataColumn) = size(parent(data), 1) -Base.size(data::DataColumn) = (1, 1, 1, length(data), 1) +Base.size(data::DataColumn) = (1, 1, ncomponents(data), length(data), 1) """ VF{S, A} <: DataColumn{S, Nv} @@ -933,7 +939,7 @@ end Base.copy(data::VF{S, Nv}) where {S, Nv} = VF{S, Nv}(copy(parent(data))) Base.lastindex(data::VF) = length(data) -Base.size(data::VF{S, Nv}) where {S, Nv} = (1, 1, 1, Nv, 1) +Base.size(data::VF{S, Nv}) where {S, Nv} = (1, 1, ncomponents(data), Nv, 1) nlevels(::VF{S, Nv}) where {S, Nv} = Nv @@ -1053,7 +1059,8 @@ end function Base.size(data::VIJFH{<:Any, Nv, Nij}) where {Nv, Nij} Nh = size(parent(data), 5) - return (Nij, Nij, 1, Nv, Nh) + Nf = ncomponents(data) + return (Nij, Nij, Nf, Nv, Nh) end function Base.length(data::VIJFH) @@ -1219,7 +1226,8 @@ Base.copy(data::VIFH{S, Nv, Ni}) where {S, Nv, Ni} = function Base.size(data::VIFH{<:Any, Nv, Ni}) where {Nv, Ni} Nh = size(parent(data), 4) - return (Ni, 1, 1, Nv, Nh) + Nf = ncomponents(data) + return (Ni, 1, Nf, Nv, Nh) end function Base.length(data::VIFH) @@ -1368,7 +1376,8 @@ Base.copy(data::IH1JH2{S, Nij}) where {S, Nij} = function Base.size(data::IH1JH2{S, Nij}) where {S, Nij} Nv = 1 Nh = div(length(parent(data)), Nij * Nij) - (Nij, Nij, 1, Nv, Nh) + Nv = ncomponents(data) + (Nij, Nij, Nf, Nv, Nh) end Base.length(data::IH1JH2{S, Nij}) where {S, Nij} = @@ -1413,7 +1422,8 @@ Base.copy(data::IV1JH2{S, n1, Ni}) where {S, n1, Ni} = function Base.size(data::IV1JH2{S, n1, Ni}) where {S, n1, Ni} Nh = div(size(parent(data), 2), Ni) - (Ni, 1, 1, n1, Nh) + Nf = ncomponents(data) + (Ni, 1, Nf, n1, Nh) end Base.length(data::IV1JH2{S, n1, Ni}) where {S, n1, Ni} = @@ -1475,12 +1485,6 @@ Adapt.adapt_structure(to, data::VF{S, Nv}) where {S, Nv} = Adapt.adapt_structure(to, data::DataF{S}) where {S} = DataF{S}(Adapt.adapt(to, parent(data))) -# TODO: Should the DataLayout be device-aware? So that we can -# determine if we're multi-threaded or not? -# This is only currently used in FusedMultiBroadcast kernels -device_from_array_type(::Type{<:AbstractArray}) = ClimaComms.CPUSingleThreaded() -ClimaComms.device(data::AbstractData) = - device_from_array_type(typeof(parent(data))) empty_kernel_stats(::ClimaComms.AbstractDevice) = nothing empty_kernel_stats() = empty_kernel_stats(ClimaComms.device()) diff --git a/src/DataLayouts/broadcast.jl b/src/DataLayouts/broadcast.jl index db822a8b2e..af3d30779c 100644 --- a/src/DataLayouts/broadcast.jl +++ b/src/DataLayouts/broadcast.jl @@ -678,13 +678,12 @@ function Base.copyto!( end, ) # check_fused_broadcast_axes(fmbc) # we should already have checked the axes - fused_copyto!(fmb_inst, dest1, ClimaComms.device(dest1)) + fused_copyto!(fmb_inst, dest1) end function fused_copyto!( fmbc::FusedMultiBroadcast, dest1::VIJFH{S1, Nv1, Nij}, - ::ClimaComms.AbstractCPUDevice, ) where {S1, Nv1, Nij} _, _, _, _, Nh = size(dest1) for (dest, bc) in fmbc.pairs @@ -700,9 +699,8 @@ end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::IJFH{S, Nij, A}, - ::ClimaComms.AbstractCPUDevice, -) where {S, Nij, A} + dest1::IJFH{S, Nij}, +) where {S, Nij} # copy contiguous columns _, _, _, Nv, Nh = size(dest1) for (dest, bc) in fmbc.pairs @@ -717,9 +715,8 @@ end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::VIFH{S, Nv1, Ni, A}, - ::ClimaComms.AbstractCPUDevice, -) where {S, Nv1, Ni, A} + dest1::VIFH{S, Nv1, Ni}, +) where {S, Nv1, Ni} # copy contiguous columns _, _, _, _, Nh = size(dest1) for (dest, bc) in fmbc.pairs @@ -734,9 +731,8 @@ end function fused_copyto!( fmbc::FusedMultiBroadcast, - dest1::VF{S1, Nv1, A}, - ::ClimaComms.AbstractCPUDevice, -) where {S1, Nv1, A} + dest1::VF{S1, Nv1}, +) where {S1, Nv1} for (dest, bc) in fmbc.pairs @inbounds for v in 1:Nv1 I = CartesianIndex(1, 1, 1, v, 1) @@ -747,11 +743,7 @@ function fused_copyto!( return nothing end -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest::DataF{S}, - ::ClimaComms.AbstractCPUDevice, -) where {S} +function fused_copyto!(fmbc::FusedMultiBroadcast, dest::DataF{S}) where {S} for (dest, bc) in fmbc.pairs @inbounds dest[] = convert(S, bc[]) end diff --git a/test/DataLayouts/benchmark_fill.jl b/test/DataLayouts/benchmark_fill.jl new file mode 100644 index 0000000000..feb25b5e10 --- /dev/null +++ b/test/DataLayouts/benchmark_fill.jl @@ -0,0 +1,40 @@ +#= +julia --project +using Revise; include(joinpath("test", "DataLayouts", "benchmark_fill.jl")) +=# +using Test +using ClimaCore.DataLayouts +using BenchmarkTools +import ClimaComms +ClimaComms.@import_required_backends + +function benchmarkfill!(device, data, val) + trial = @benchmark ClimaComms.@cuda_sync $device fill!($data, $val) + show(stdout, MIME("text/plain"), trial) + println() +end + +@testset "fill! with Nf = 1" begin + device = ClimaComms.device() + device_zeros(args...) = ClimaComms.array_type(device)(zeros(args...)) + FT = Float64 + S = FT + Nf = 1 + Nv = 63 + Nij = 4 + Nh = 30 + Nk = 6 +#! format: off + data = DataF{S}(device_zeros(FT,Nf)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = IJFH{S, Nij}(device_zeros(FT,Nij,Nij,Nf,Nh)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = IFH{S, Nij}(device_zeros(FT,Nij,Nf,Nh)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = IF{S, Nij}(device_zeros(FT,Nij,Nf)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = VF{S, Nv}(device_zeros(FT,Nv,Nf)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = VIJFH{S, Nv, Nij}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) + data = VIFH{S, Nv, Nij}(device_zeros(FT,Nv,Nij,Nf,Nh)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) +#! format: on + + # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) # TODO: test + # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkfill!(device, data, 3); @test all(parent(data) .== 3) # TODO: test +end diff --git a/test/DataLayouts/fill.jl b/test/DataLayouts/fill.jl new file mode 100644 index 0000000000..95c3b31af5 --- /dev/null +++ b/test/DataLayouts/fill.jl @@ -0,0 +1,57 @@ +#= +julia --project +using Revise; include(joinpath("test", "DataLayouts", "fill.jl")) +=# +using Test +using ClimaCore.DataLayouts +import ClimaComms +import CUDA +ClimaComms.@import_required_backends + +@testset "fill! with Nf = 1" begin + device = ClimaComms.device() + device_zeros(args...) = ClimaComms.array_type(device)(zeros(args...)) + FT = Float64 + S = FT + Nf = 1 + Nv = 4 + Nij = 3 + Nh = 5 + Nk = 6 +#! format: off + data = DataF{S}(device_zeros(FT,Nf)); fill!(data, 3); @test all(parent(data) .== 3) + data = IJFH{S, Nij}(device_zeros(FT,Nij,Nij,Nf,Nh)); fill!(data, 3); @test all(parent(data) .== 3) + data = IFH{S, Nij}(device_zeros(FT,Nij,Nf,Nh)); fill!(data, 3); @test all(parent(data) .== 3) + data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf)); fill!(data, 3); @test all(parent(data) .== 3) + data = IF{S, Nij}(device_zeros(FT,Nij,Nf)); fill!(data, 3); @test all(parent(data) .== 3) + data = VF{S, Nv}(device_zeros(FT,Nv,Nf)); fill!(data, 3); @test all(parent(data) .== 3) + data = VIJFH{S, Nv, Nij}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh)); fill!(data, 3); @test all(parent(data) .== 3) + data = VIFH{S, Nv, Nij}(device_zeros(FT,Nv,Nij,Nf,Nh)); fill!(data, 3); @test all(parent(data) .== 3) +#! format: on + # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data) .== 3) # TODO: test + # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data) .== 3) # TODO: test +end + +@testset "fill! with Nf > 1" begin + device = ClimaComms.device() + device_zeros(args...) = ClimaComms.array_type(device)(zeros(args...)) + FT = Float64 + S = Tuple{FT, FT} + Nf = 2 + Nv = 4 + Nij = 3 + Nh = 5 + Nk = 6 +#! format: off + data = DataF{S}(device_zeros(FT,Nf)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = IJFH{S, Nij}(device_zeros(FT,Nij,Nij,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = IFH{S, Nij}(device_zeros(FT,Nij,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = IF{S, Nij}(device_zeros(FT,Nij,Nf)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = VF{S, Nv}(device_zeros(FT,Nv,Nf)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = VIJFH{S, Nv, Nij}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) + data = VIFH{S, Nv, Nij}(device_zeros(FT,Nv,Nij,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data.:1) .== 2); @test all(parent(data.:2) .== 3) +#! format: on + # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data) .== (2,3)) # TODO: test + # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); fill!(data, (2,3)); @test all(parent(data) .== (2,3)) # TODO: test +end diff --git a/test/Fields/field_multi_broadcast_fusion.jl b/test/Fields/field_multi_broadcast_fusion.jl index 5d6b128720..652a2ace57 100644 --- a/test/Fields/field_multi_broadcast_fusion.jl +++ b/test/Fields/field_multi_broadcast_fusion.jl @@ -72,9 +72,9 @@ function CenterExtrudedFiniteDifferenceSpaceLineHSpace( return Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace) end -function benchmark_kernel!(f!, X, Y) +function benchmark_kernel!(f!, X, Y, device) println("\n--------------------------- $(nameof(typeof(f!))) ") - trial = benchmark_kernel!(f!, X, Y, ClimaComms.device(X.x1)) + trial = benchmark_kernel!(f!, X, Y, device) show(stdout, MIME("text/plain"), trial) end benchmark_kernel!(f!, X, Y, ::ClimaComms.CUDADevice) = @@ -250,11 +250,12 @@ end @testset "FusedMultiBroadcast VIJFH and VF" begin FT = Float64 + device = ClimaComms.device() space = TU.CenterExtrudedFiniteDifferenceSpace( FT; zelem = 3, helem = 4, - context = ClimaComms.context(), + context = ClimaComms.context(device), ) X = Fields.FieldVector( x1 = rand_field(FT, space), @@ -269,11 +270,11 @@ end test_kernel!(; fused!, unfused!, X, Y) test_kernel!(; fused! = fused_bycolumn!, unfused! = unfused_bycolumn!, X, Y) - benchmark_kernel!(unfused!, X, Y) - benchmark_kernel!(fused!, X, Y) + benchmark_kernel!(unfused!, X, Y, device) + benchmark_kernel!(fused!, X, Y, device) - benchmark_kernel!(unfused_bycolumn!, X, Y) - benchmark_kernel!(fused_bycolumn!, X, Y) + benchmark_kernel!(unfused_bycolumn!, X, Y, device) + benchmark_kernel!(fused_bycolumn!, X, Y, device) nothing end @@ -306,11 +307,11 @@ end Y, ) - benchmark_kernel!(unfused!, X, Y) - benchmark_kernel!(fused!, X, Y) + benchmark_kernel!(unfused!, X, Y, device) + benchmark_kernel!(fused!, X, Y, device) - benchmark_kernel!(unfused_bycolumn!, X, Y) - benchmark_kernel!(fused_bycolumn!, X, Y) + benchmark_kernel!(unfused_bycolumn!, X, Y, device) + benchmark_kernel!(fused_bycolumn!, X, Y, device) nothing end end @@ -332,8 +333,8 @@ end y3 = IJFH_data(), ) test_kernel!(; fused!, unfused!, X, Y) - benchmark_kernel!(unfused!, X, Y) - benchmark_kernel!(fused!, X, Y) + benchmark_kernel!(unfused!, X, Y, device) + benchmark_kernel!(fused!, X, Y, device) nothing end @@ -350,8 +351,8 @@ end X = Fields.FieldVector(; x1 = VF_data(), x2 = VF_data(), x3 = VF_data()) Y = Fields.FieldVector(; y1 = VF_data(), y2 = VF_data(), y3 = VF_data()) test_kernel!(; fused!, unfused!, X, Y) - benchmark_kernel!(unfused!, X, Y) - benchmark_kernel!(fused!, X, Y) + benchmark_kernel!(unfused!, X, Y, device) + benchmark_kernel!(fused!, X, Y, device) nothing end @@ -371,7 +372,7 @@ end y3 = DataF_data(), ) test_kernel!(; fused!, unfused!, X, Y) - benchmark_kernel!(unfused!, X, Y) - benchmark_kernel!(fused!, X, Y) + benchmark_kernel!(unfused!, X, Y, device) + benchmark_kernel!(fused!, X, Y, device) nothing end