diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f04dba10fe..6f6fd0bcc4 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -673,6 +673,14 @@ steps: key: unit_limiter command: "julia --color=yes --check-bounds=yes --project=test test/Limiters/limiter.jl" + - label: "Unit: limiter cuda" + key: unit_limiter_gpu + command: + - "julia --project -e 'using CUDA; CUDA.versioninfo()'" + - "julia --color=yes --project=test test/Limiters/limiter.jl" + agents: + slurm_gpus: 1 + - label: "Unit: distributed limiters" key: unit_limiters_distributed command: "srun julia --color=yes --check-bounds=yes --project=test test/Limiters/distributed/dlimiter.jl" diff --git a/src/Limiters/quasimonotone.jl b/src/Limiters/quasimonotone.jl index 129702c3f6..4dafc16c02 100644 --- a/src/Limiters/quasimonotone.jl +++ b/src/Limiters/quasimonotone.jl @@ -1,3 +1,6 @@ +import ClimaComms +import CUDA + """ QuasiMonotoneLimiter @@ -85,12 +88,30 @@ storing it in `limiter.q_bounds`. Part of [`compute_bounds!`](@ref). """ function compute_element_bounds!(limiter::QuasiMonotoneLimiter, ρq, ρ) + compute_element_bounds!(limiter, ρq, ρ, ClimaComms.device(ρ)) +end + +function compute_element_bounds!( + limiter::QuasiMonotoneLimiter, + ρq, + ρ, + ::ClimaComms.CUDADevice, +) end + +function compute_element_bounds!( + limiter::QuasiMonotoneLimiter, + ρq, + ρ, + ::ClimaComms.AbstractCPUDevice, +) + ρ_data = Fields.field_values(ρ) + ρq_data = Fields.field_values(ρq) q_bounds = limiter.q_bounds - (Ni, Nj, _, Nv, Nh) = size(ρq) + (Ni, Nj, _, Nv, Nh) = size(ρq_data) for h in 1:Nh for v in 1:Nv - slab_ρq = slab(ρq, v, h) - slab_ρ = slab(ρ, v, h) + slab_ρq = slab(ρq_data, v, h) + slab_ρ = slab(ρ_data, v, h) local q_min, q_max for j in 1:Nj for i in 1:Ni @@ -120,10 +141,21 @@ neighbors. Part of [`compute_bounds!`](@ref). """ +compute_neighbor_bounds_local!(limiter::QuasiMonotoneLimiter, ρ) = + compute_neighbor_bounds_local!(limiter, ρ, ClimaComms.device(ρ)) + function compute_neighbor_bounds_local!( limiter::QuasiMonotoneLimiter, - topology::Topologies.AbstractTopology, + ρ, + ::ClimaComms.CUDADevice, +) end + +function compute_neighbor_bounds_local!( + limiter::QuasiMonotoneLimiter, + ρ, + ::ClimaComms.AbstractCPUDevice, ) + topology = Spaces.topology(axes(ρ)) q_bounds = limiter.q_bounds q_bounds_nbr = limiter.q_bounds_nbr (_, _, _, Nv, Nh) = size(q_bounds_nbr) @@ -198,34 +230,20 @@ function compute_bounds!( limiter::QuasiMonotoneLimiter, ρq::Fields.Field, ρ::Fields.Field, -) - topology = Spaces.topology(axes(ρq)) - compute_bounds!( - limiter, - Fields.field_values(ρq), - Fields.field_values(ρ), - topology, - ) -end -function compute_bounds!( - limiter::QuasiMonotoneLimiter, - ρq, - ρ, - topology::Topologies.AbstractTopology, ) compute_element_bounds!(limiter, ρq, ρ) if limiter.ghost_buffer isa Spaces.GhostBuffer Spaces.fill_send_buffer!( - topology, + Spaces.topology(axes(ρq)), limiter.q_bounds, limiter.ghost_buffer, ) ClimaComms.start(limiter.ghost_buffer.graph_context) end - compute_neighbor_bounds_local!(limiter, topology) + compute_neighbor_bounds_local!(limiter, ρ) if limiter.ghost_buffer isa Spaces.GhostBuffer - ClimaComms.finish(ghost_buffer.graph_context) - compute_neighbor_bounds_ghost!(limiter, topology) + ClimaComms.finish(limiter.ghost_buffer.graph_context) + compute_neighbor_bounds_ghost!(limiter, Spaces.topology(axes(ρq))) end end @@ -240,10 +258,24 @@ on the concentration `q` and density `ρ` as an optimal weight. This iterates over each element, calling [`apply_limit_slab!`](@ref). If the limiter fails to converge for any element, a warning is issued. """ +apply_limiter!( + ρq::Fields.Field, + ρ::Fields.Field, + limiter::QuasiMonotoneLimiter, +) = apply_limiter!(ρq, ρ, limiter, ClimaComms.device(ρ)) + +function apply_limiter!( + ρq::Fields.Field, + ρ::Fields.Field, + limiter::QuasiMonotoneLimiter, + ::ClimaComms.CUDADevice, +) end + function apply_limiter!( ρq::Fields.Field, ρ::Fields.Field, limiter::QuasiMonotoneLimiter, + ::ClimaComms.AbstractCPUDevice, ) (; q_bounds_nbr, rtol) = limiter diff --git a/test/Limiters/limiter.jl b/test/Limiters/limiter.jl index 539ca00f3f..8d0a99e576 100644 --- a/test/Limiters/limiter.jl +++ b/test/Limiters/limiter.jl @@ -1,3 +1,9 @@ +#= +julia --project=test +using Revise; include(joinpath("test", "Limiters", "limiter.jl")) +=# +import CUDA +CUDA.allowscalar(false) using ClimaComms using ClimaCore: DataLayouts, Fields, Domains, Geometry, Topologies, Meshes, Spaces, Limiters @@ -6,7 +12,7 @@ using ClimaCore: slab using Test # 2D mesh setup -function rectangular_mesh( +function rectangular_mesh_space( n1, n2, x1periodic, @@ -16,6 +22,9 @@ function rectangular_mesh( x1max = 1.0, x2min = 0.0, x2max = 1.0, + Nij, + device = ClimaComms.device(), + comms_ctx = ClimaComms.SingletonCommsContext(device), ) domain = Domains.RectangleDomain( Domains.IntervalDomain( @@ -33,7 +42,10 @@ function rectangular_mesh( x2periodic ? nothing : (:south, :north), ), ) - return Meshes.RectilinearMesh(domain, n1, n2) + mesh = Meshes.RectilinearMesh(domain, n1, n2) + topology = Topologies.Topology2D(comms_ctx, mesh) + quad = Spaces.Quadratures.GLL{Nij}() + return Spaces.SpectralElementSpace2D(topology, quad) end # 2D x 1D hybrid function space setup @@ -46,6 +58,8 @@ function hvspace_3D( yelems = 8, zelems = 16, Nij = 4, + device = ClimaComms.device(), + comms_ctx = ClimaComms.SingletonCommsContext(device), ) xdomain = Domains.IntervalDomain( @@ -61,8 +75,7 @@ function hvspace_3D( horzdomain = Domains.RectangleDomain(xdomain, ydomain) horzmesh = Meshes.RectilinearMesh(horzdomain, xelems, yelems) - horztopology = - Topologies.Topology2D(ClimaComms.SingletonCommsContext(), horzmesh) + horztopology = Topologies.Topology2D(comms_ctx, horzmesh) zdomain = Domains.IntervalDomain( Geometry.ZPoint{FT}(zlim[1]), @@ -70,7 +83,9 @@ function hvspace_3D( boundary_names = (:bottom, :top), ) vertmesh = Meshes.IntervalMesh(zdomain, nelems = zelems) - vert_center_space = Spaces.CenterFiniteDifferenceSpace(vertmesh) + z_topology = Topologies.IntervalTopology(comms_ctx, vertmesh) + + vert_center_space = Spaces.CenterFiniteDifferenceSpace(z_topology) quad = Spaces.Quadratures.GLL{Nij}() horzspace = Spaces.SpectralElementSpace2D(horztopology, quad) @@ -86,24 +101,24 @@ end Nij = 4 n1 = n2 = 5 + device = ClimaComms.device() + comms_ctx = ClimaComms.SingletonCommsContext(device) for FT in (Float64, Float32) lim_tol = FT(5e-14) - mesh = rectangular_mesh( + space = rectangular_mesh_space( n1, n2, false, - false, + false; FT = FT, x1min = 0.0, x1max = 2.0 * n1, x2min = 0.0, x2max = 3.0 * n2, + Nij, + comms_ctx, ) - topology = - Topologies.Topology2D(ClimaComms.SingletonCommsContext(), mesh) - quad = Spaces.Quadratures.GLL{Nij}() - space = Spaces.SpectralElementSpace2D(topology, quad) # Initialize fields ρ = map(coord -> exp(-coord.y), Fields.coordinate_field(space)) @@ -113,23 +128,24 @@ end limiter = Limiters.QuasiMonotoneLimiter(ρq) Limiters.compute_bounds!(limiter, ρq, ρ) + is_cpu = device isa ClimaComms.AbstractCPUDevice for h2 in 1:n2 for h1 in 1:n1 s = slab(limiter.q_bounds, h1 + n1 * (h2 - 1)) - q_min = s[1] - q_max = s[2] - @test q_min.x ≈ 2 * (h1 - 1) - @test q_min.y ≈ 3 * (h2 - 1) - @test q_max.x ≈ 2 * h1 - @test q_max.y ≈ 3 * h2 + CUDA.@allowscalar q_min = s[1] + CUDA.@allowscalar q_max = s[2] + is_cpu && @test q_min.x ≈ 2 * (h1 - 1) + is_cpu && @test q_min.y ≈ 3 * (h2 - 1) + is_cpu && @test q_max.x ≈ 2 * h1 + is_cpu && @test q_max.y ≈ 3 * h2 s_nbr = slab(limiter.q_bounds_nbr, h1 + n1 * (h2 - 1)) - q_min = s_nbr[1] - q_max = s_nbr[2] - @test q_min.x ≈ 2 * max(h1 - 2, 0) - @test q_min.y ≈ 3 * max(h2 - 2, 0) - @test q_max.x ≈ 2 * min(h1 + 1, n1) - @test q_max.y ≈ 3 * min(h2 + 1, n2) + CUDA.@allowscalar q_min = s_nbr[1] + CUDA.@allowscalar q_max = s_nbr[2] + is_cpu && @test q_min.x ≈ 2 * max(h1 - 2, 0) + is_cpu && @test q_min.y ≈ 3 * max(h2 - 2, 0) + is_cpu && @test q_max.x ≈ 2 * min(h1 + 1, n1) + is_cpu && @test q_max.y ≈ 3 * min(h2 + 1, n2) end end end @@ -167,14 +183,14 @@ end @testset "Optimization-based limiter on a 1×1 elem 2D domain space" begin Nij = 2 + device = ClimaComms.device() + comms_ctx = ClimaComms.SingletonCommsContext(device) + gpu_broken = device isa ClimaComms.CUDADevice for FT in (Float32,) lim_tol = FT(5e-14) - mesh = rectangular_mesh(1, 1, false, false; FT = FT) - topology = - Topologies.Topology2D(ClimaComms.SingletonCommsContext(), mesh) - quad = Spaces.Quadratures.GLL{Nij}() - space = Spaces.SpectralElementSpace2D(topology, quad) + space = + rectangular_mesh_space(1, 1, false, false; FT = FT, Nij, comms_ctx) # Initialize fields ρ = ones(FT, space) @@ -194,8 +210,9 @@ end Limiters.compute_bounds!(limiter, ρq_ref, ρ) Limiters.apply_limiter!(ρq, ρ, limiter) - @test parent(ρq)[:, :, 1, 1] ≈ - [FT(0.0) FT(0.0); FT(0.950005) FT(0.950005)] rtol = 10eps(FT) + @test Array(parent(ρq))[:, :, 1, 1] ≈ + [FT(0.0) FT(0.0); FT(0.950005) FT(0.950005)] rtol = 10eps(FT) broken = + gpu_broken # Check mass conservation after application of limiter @test sum(ρq) ≈ initial_Q_mass rtol = 10eps(FT) end @@ -204,13 +221,12 @@ end @testset "Optimization-based limiter on a 3×3 elem 2D domain space" begin Nij = 5 + device = ClimaComms.device() + comms_ctx = ClimaComms.SingletonCommsContext(device) for FT in (Float64, Float32) - mesh = rectangular_mesh(3, 3, false, false; FT = FT) - topology = - Topologies.Topology2D(ClimaComms.SingletonCommsContext(), mesh) - quad = Spaces.Quadratures.GLL{Nij}() - space = Spaces.SpectralElementSpace2D(topology, quad) + space = + rectangular_mesh_space(3, 3, false, false; FT = FT, Nij, comms_ctx) x_scale = FT(1.2) y_scale = FT(1.5) @@ -228,17 +244,19 @@ end ) ρq_ref = ρ .⊠ q_ref - total_ρq = sum(ρq) + if device isa ClimaComms.AbstractCPUDevice + total_ρq = sum(ρq) # broken on the GPU - limiter = Limiters.QuasiMonotoneLimiter(ρq) + limiter = Limiters.QuasiMonotoneLimiter(ρq) - Limiters.compute_bounds!(limiter, ρq_ref, ρ) - Limiters.apply_limiter!(ρq, ρ, limiter) - q = RecursiveApply.rdiv.(ρq, ρ) + Limiters.compute_bounds!(limiter, ρq_ref, ρ) + Limiters.apply_limiter!(ρq, ρ, limiter) + q = RecursiveApply.rdiv.(ρq, ρ) - @test sum(ρq.x) ≈ total_ρq.x - @test sum(ρq.y) ≈ total_ρq.y - @test all(FT(0) .<= parent(ρq) .<= FT(1)) + @test sum(ρq.x) ≈ total_ρq.x + @test sum(ρq.y) ≈ total_ρq.y + @test all(FT(0) .<= parent(ρq) .<= FT(1)) + end end end @@ -247,10 +265,12 @@ end Nij = 5 n1 = n2 = 3 n3 = 3 + device = ClimaComms.device() + comms_ctx = ClimaComms.SingletonCommsContext(device) for FT in (Float64, Float32) horzspace, hv_center_space, hv_face_space = hvspace_3D( - FT, + FT; xlim = (FT(0), FT(1)), ylim = (FT(0), FT(1)), zlim = (FT(0), FT(2)), @@ -258,6 +278,7 @@ end xelems = n1, yelems = n2, zelems = n3, + comms_ctx, ) x_scale = FT(1.2) @@ -279,16 +300,18 @@ end ) ρq_ref = ρ .⊠ q_ref - total_ρq = sum(ρq) + if device isa ClimaComms.AbstractCPUDevice + total_ρq = sum(ρq) - limiter = Limiters.QuasiMonotoneLimiter(ρq) + limiter = Limiters.QuasiMonotoneLimiter(ρq) - Limiters.compute_bounds!(limiter, ρq_ref, ρ) - Limiters.apply_limiter!(ρq, ρ, limiter) - q = RecursiveApply.rdiv.(ρq, ρ) + Limiters.compute_bounds!(limiter, ρq_ref, ρ) + Limiters.apply_limiter!(ρq, ρ, limiter) + q = RecursiveApply.rdiv.(ρq, ρ) - @test sum(ρq.x) ≈ total_ρq.x - @test sum(ρq.y) ≈ total_ρq.y - @test all(FT(0) .<= parent(ρq) .<= FT(1)) + @test sum(ρq.x) ≈ total_ρq.x + @test sum(ρq.y) ≈ total_ρq.y + @test all(FT(0) .<= parent(ρq) .<= FT(1)) + end end end