From f1e9282993aa70830b12b0df5040a6453b72e0c2 Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Sat, 2 Nov 2024 14:57:04 -0400 Subject: [PATCH] FieldVectors, add benchmarks, improve perf --- .buildkite/pipeline.yml | 1 + ext/cuda/data_layouts_copyto.jl | 32 ++++++ src/DataLayouts/copyto.jl | 16 +++ src/Fields/Fields.jl | 3 +- src/Fields/fieldvector.jl | 36 ++++--- test/Fields/benchmark_fieldvectors.jl | 137 ++++++++++++++++++++++++++ 6 files changed, 209 insertions(+), 16 deletions(-) create mode 100644 test/Fields/benchmark_fieldvectors.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 7331b5dd97..cecfec7fb0 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -486,6 +486,7 @@ steps: key: unit_field command: - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl" + - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_fieldvectors.jl" - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl" - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl" - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/inference_repro.jl" diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl index 8f14df0bc4..60923401f9 100644 --- a/ext/cuda/data_layouts_copyto.jl +++ b/ext/cuda/data_layouts_copyto.jl @@ -96,3 +96,35 @@ function Base.copyto!( @inbounds bc0 = bc[] fill!(dest, bc0) end + +# For field-vector operations +function DataLayouts.copyto_per_field!( + array::AbstractArray, + bc::Base.Broadcast.Broadcasted, + ::ToCUDA, +) + bc′ = DataLayouts.to_non_extruded_broadcasted(bc) + # All field variables are treated separately, so + # we can parallelize across the field index, and + # leverage linear indexing: + nitems = prod(size(array)) + N = prod(size(array)) + args = (array, bc′, N) + threads = 
threads_via_occupancy(copyto_per_field_kernel!, args) + n_max_threads = min(threads, nitems) + p = linear_partition(nitems, n_max_threads) + auto_launch!( + copyto_per_field_kernel!, + args; + threads_s = p.threads, + blocks_s = p.blocks, + ) + return array +end +function copyto_per_field_kernel!(array, bc, N) + i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + if 1 ≤ i ≤ N + @inbounds array[i] = bc[i] + end + return nothing +end diff --git a/src/DataLayouts/copyto.jl b/src/DataLayouts/copyto.jl index a7b5949ddf..4678a98f84 100644 --- a/src/DataLayouts/copyto.jl +++ b/src/DataLayouts/copyto.jl @@ -180,3 +180,19 @@ function Base.copyto!( end return dest end + +function copyto_per_field!( + array::AbstractArray, + bc::Base.Broadcast.Broadcasted, + ::ToCPU, +) + bc′ = to_non_extruded_broadcasted(bc) + # All field variables are treated separately, so + # we can parallelize across the field index, and + # leverage linear indexing: + N = prod(size(array)) + @inbounds @simd for I in 1:N + array[I] = bc′[I] + end + return array +end diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl index 171daf699e..3be9b60af1 100644 --- a/src/Fields/Fields.jl +++ b/src/Fields/Fields.jl @@ -10,7 +10,8 @@ import ..DataLayouts: FusedMultiBroadcast, @fused_direct, isascalar, - check_fused_broadcast_axes + check_fused_broadcast_axes, + copyto_per_field! 
import ..Domains import ..Topologies import ..Quadratures diff --git a/src/Fields/fieldvector.jl b/src/Fields/fieldvector.jl index 520a86349e..a2c4be5812 100644 --- a/src/Fields/fieldvector.jl +++ b/src/Fields/fieldvector.jl @@ -175,15 +175,6 @@ function Base.similar( error("Cannot construct FieldVector") end -@inline function Base.copyto!(dest::FV, src::FV) where {FV <: FieldVector} - for symb in propertynames(dest) - pd = parent(getproperty(dest, symb)) - ps = parent(getproperty(src, symb)) - copyto!(pd, ps) - end - return dest -end - """ Spaces.create_dss_buffer(fv::FieldVector) @@ -218,6 +209,20 @@ function Spaces.weighted_dss!( Spaces.weighted_dss!(pairs...) end +Base.Broadcast.broadcasted(fs::FieldVectorStyle, ::typeof(+), args...) = + Base.Broadcast.broadcasted(fs, RecursiveApply.:⊞, args...) + +Base.Broadcast.broadcasted(fs::FieldVectorStyle, ::typeof(-), args...) = + Base.Broadcast.broadcasted(fs, RecursiveApply.:⊟, args...) + +Base.Broadcast.broadcasted(fs::FieldVectorStyle, ::typeof(*), args...) = + Base.Broadcast.broadcasted(fs, RecursiveApply.:⊠, args...) + +Base.Broadcast.broadcasted(fs::FieldVectorStyle, ::typeof(/), args...) = + Base.Broadcast.broadcasted(fs, RecursiveApply.rdiv, args...) + +Base.Broadcast.broadcasted(fs::FieldVectorStyle, ::typeof(muladd), args...) = + Base.Broadcast.broadcasted(fs, RecursiveApply.rmuladd, args...) # Recursively call transform_bc_args() on broadcast arguments in a way that is statically reducible by the optimizer # see Base.Broadcast.preprocess_args @@ -241,7 +246,7 @@ end ) end @inline transform_broadcasted(fv::FieldVector, symb, axes) = - parent(getfield(_values(fv), symb)) + parent(field_values(getfield(_values(fv), symb))) @inline transform_broadcasted(x, symb, axes) = x @inline function first_fieldvector_in_bc(args::Tuple, rargs...) 
@@ -332,12 +337,13 @@ end
 
 @inline function Base.copyto!(
     dest::FieldVector,
-    bc::Base.Broadcast.Broadcasted{FieldVectorStyle},
+    bc::Union{FieldVector, Base.Broadcast.Broadcasted{FieldVectorStyle}},
 )
     map(propertynames(dest)) do symb
         Base.@_inline_meta
-        p = parent(getfield(_values(dest), symb))
-        copyto!(p, transform_broadcasted(bc, symb, axes(p)))
+        array = parent(getfield(_values(dest), symb))
+        bct = transform_broadcasted(bc, symb, axes(array))
+        copyto_per_field!(array, bct, DataLayouts.device_dispatch(array))
     end
     return dest
 end
 
@@ -348,8 +354,8 @@ end
 )
     map(propertynames(dest)) do symb
         Base.@_inline_meta
-        p = parent(getfield(_values(dest), symb))
-        copyto!(p, bc)
+        array = parent(getfield(_values(dest), symb))
+        copyto_per_field!(array, bc, DataLayouts.device_dispatch(array))
         nothing
     end
     return dest
diff --git a/test/Fields/benchmark_fieldvectors.jl b/test/Fields/benchmark_fieldvectors.jl
new file mode 100644
index 0000000000..2098f6ea77
--- /dev/null
+++ b/test/Fields/benchmark_fieldvectors.jl
@@ -0,0 +1,137 @@
+#=
+julia --project
+ENV["CLIMACOMMS_DEVICE"] = "CPU"; using Revise; include(joinpath("test", "Fields", "benchmark_fieldvectors.jl"))
+ENV["CLIMACOMMS_DEVICE"] = "CUDA"; using Revise; include(joinpath("test", "Fields", "benchmark_fieldvectors.jl"))
+=#
+using Test
+using ClimaCore.DataLayouts
+using ClimaCore: Spaces, Fields, Geometry
+using BenchmarkTools
+import ClimaComms
+import ClimaCore
+@static pkgversion(ClimaComms) >= v"0.6" && ClimaComms.@import_required_backends
+if ClimaComms.device() isa ClimaComms.CUDADevice
+    import CUDA
+    device_name = CUDA.name(CUDA.device()) # Move to ClimaComms
+else
+    device_name = "CPU"
+end
+if !(@isdefined(TU))
+    include(
+        joinpath(
+            pkgdir(ClimaCore),
+            "test",
+            "TestUtilities",
+            "TestUtilities.jl",
+        ),
+    )
+    import .TestUtilities as TU
+end
+
+include(joinpath(pkgdir(ClimaCore), "benchmarks/scripts/benchmark_utils.jl"))
+
+function benchmarkcopyto!(bm, device, data, val)
+    caller = 
string(nameof(typeof(data))) + @info "Benchmarking $caller..." + data_rhs_1 = similar(data) + data_rhs_2 = similar(data) + data_rhs_3 = similar(data) + data_rhs_4 = similar(data) + T = eltype(parent(data)) + dt = T(0) + bc1 = Base.Broadcast.broadcasted(+, data_rhs_2, data_rhs_3, data_rhs_4) + bc2 = Base.Broadcast.broadcasted(*, dt, bc1) + bc = Base.Broadcast.broadcasted(+, data_rhs_1, bc2) + trial = @benchmark ClimaComms.@cuda_sync $device Base.copyto!($data, $bc) + t_min = minimum(trial.times) * 1e-9 # to seconds + nreps = length(trial.times) + n_reads_writes = 1 + 4 + push_info( + bm; + kernel_time_s = t_min, + nreps = nreps, + caller, + problem_size = size(parent(data)), + n_reads_writes, + ) +end + +function fv_state(cspace, fspace) + FT = Spaces.undertype(cspace) + return Fields.FieldVector( + c = fill( + (; + ρ = FT(0), + uₕ = zero(Geometry.Covariant12Vector{FT}), + e_int = FT(0), + q_tot = FT(0), + ), + cspace, + ), + f = fill((; w = Geometry.Covariant3Vector(FT(0))), fspace), + ) +end + +@testset "FieldVector FH" begin + FT = Float64 + device = ClimaComms.device() + + bm = Benchmark(; float_type = FT, device_name) + cspace = TU.CenterExtrudedFiniteDifferenceSpace( + FT; + zelem = 63, + helem = 30, + Nq = 4, + context = ClimaComms.context(device), + ) + fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace) + X = fv_state(cspace, fspace) + benchmarkcopyto!(bm, device, X, 3) + + cspace = TU.CenterExtrudedFiniteDifferenceSpace( + FT; + zelem = 63, + helem = 15, + Nq = 4, + context = ClimaComms.context(device), + ) + fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace) + X = fv_state(cspace, fspace) + benchmarkcopyto!(bm, device, X, 3) + + tabulate_benchmark(bm) + nothing +end + +@testset "FieldVector HF" begin + FT = Float64 + device = ClimaComms.device() + + bm = Benchmark(; float_type = FT, device_name) + cspace = TU.CenterExtrudedFiniteDifferenceSpace( + FT; + zelem = 63, + helem = 30, + Nq = 4, + context = ClimaComms.context(device), + 
horizontal_layout_type = DataLayouts.IJHF, + ) + fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace) + X = fv_state(cspace, fspace) + benchmarkcopyto!(bm, device, X, 3) + + cspace = TU.CenterExtrudedFiniteDifferenceSpace( + FT; + zelem = 63, + helem = 15, + Nq = 4, + context = ClimaComms.context(device), + horizontal_layout_type = DataLayouts.IJHF, + ) + fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace) + X = fv_state(cspace, fspace) + benchmarkcopyto!(bm, device, X, 3) + + tabulate_benchmark(bm) + nothing +end