From bc1a12e9415771bab65dd54e180dcdf4c0f3df7c Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Fri, 19 Jul 2024 16:59:08 -0400
Subject: [PATCH 1/3] Fix and update benchmark script

---
 .../scripts/linear_vs_cartesian_indexing.jl   | 124 +++++++++++++-----
 1 file changed, 91 insertions(+), 33 deletions(-)

diff --git a/benchmarks/scripts/linear_vs_cartesian_indexing.jl b/benchmarks/scripts/linear_vs_cartesian_indexing.jl
index 9622353882..5bc9645183 100644
--- a/benchmarks/scripts/linear_vs_cartesian_indexing.jl
+++ b/benchmarks/scripts/linear_vs_cartesian_indexing.jl
@@ -3,6 +3,10 @@ julia --project=.buildkite
 using Revise; include(joinpath("benchmarks", "scripts", "linear_vs_cartesian_indexing.jl"))
 
 # Info:
+This script compares two things:
+ - linear vs cartesian indexing
+ - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
+
 Linear indexing, when possible, has performance advantages
 over using Cartesian indexing. Julia Base's Broadcast only
 supports Cartesian indexing as it provides more general support
@@ -13,6 +17,9 @@ This script (re-)defines some broadcast machinery and tests
 the performance of vector vs array operations in a broadcast
 setting where linear indexing is allowed.
 
+# Summary:
+ - Pointwise
+
 # References:
 - https://github.com/CliMA/ClimaCore.jl/issues/1889
 - https://github.com/JuliaLang/julia/issues/28126
@@ -23,27 +30,43 @@ setting where linear indexing is allowed.
 Local Apple M1 Mac (CPU):
 ```
 at_dot_call!($X_array, $Y_array):
-    146 milliseconds, 558 microseconds
+    143 milliseconds, 774 microseconds
 at_dot_call!($X_vector, $Y_vector):
-    65 milliseconds, 531 microseconds
-custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
-    66 milliseconds, 735 microseconds
-custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
-    145 milliseconds, 957 microseconds
-custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
-    66 milliseconds, 320 microseconds
+    65 milliseconds, 567 microseconds
+custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
+    66 milliseconds, 870 microseconds
+custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
+    143 milliseconds, 643 microseconds
+custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
+    65 milliseconds, 778 microseconds
+custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
+    65 milliseconds, 765 microseconds
+custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
+    144 milliseconds, 271 microseconds
+custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
+    66 milliseconds, 376 microseconds
 ```
 
 Clima A100
 ```
+at_dot_call!($X_array, $Y_array):
+    6 milliseconds, 775 microseconds
 at_dot_call!($X_vector, $Y_vector):
-    2 milliseconds, 848 microseconds
-custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
-    2 milliseconds, 537 microseconds
-custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
-    8 milliseconds, 804 microseconds
-custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
-    2 milliseconds, 545 microseconds
+    2 milliseconds, 834 microseconds
+custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))):
+    2 milliseconds, 547 microseconds
+custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
+    2 milliseconds, 561 microseconds
+custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
+    4 milliseconds, 160 microseconds
+custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
+    2 milliseconds, 584 microseconds
+custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
+    2 milliseconds, 540 microseconds
+custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
+    2 milliseconds, 715 microseconds
+custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
+    2 milliseconds, 547 microseconds
 ```
 
 =#
@@ -239,7 +262,7 @@ function at_dot_call!(X, Y)
     return nothing
 end;
 
-function custom_kernel!(X, Y, ::Val{N}) where {N}
+function custom_sol_kernel!(X, Y, ::Val{N}) where {N}
     (; x1, x2, x3) = X
     (; y1) = Y
     kernel = CUDA.@cuda always_inline = true launch = false custom_kernel_knl!(
@@ -267,7 +290,27 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
     return nothing
 end;
 
-function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
+abstract type AbstractUniversalSizes{Nv, Nij} end
+struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij}
+    Nh::Int
+end
+struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end
+
+get_Nv(::AbstractUniversalSizes{Nv}) where {Nv} = Nv
+get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = Nij
+get_Nh(us::UniversalSizesCC) = us.Nh
+get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
+get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = prod((Nv,Nij,Nij,1,get_Nh(us)))
+UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh)
+UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
+using Test
+us_tup = (1, 2, 3)
+@test get_Nv(UniversalSizesCC(us_tup...)) == get_Nv(UniversalSizesStatic(us_tup...))
+@test get_Nij(UniversalSizesCC(us_tup...)) == get_Nij(UniversalSizesStatic(us_tup...))
+@test get_Nh(UniversalSizesCC(us_tup...)) == get_Nh(UniversalSizesStatic(us_tup...))
+@test get_N(UniversalSizesCC(us_tup...)) == get_N(UniversalSizesStatic(us_tup...))
+
+function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=true, use_pw=true)
     (; x1, x2, x3) = X
     (; y1) = Y
     bc_base = @lazy @. y1 = myadd(x1, x2, x3)
@@ -281,7 +324,7 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
         end
     else
         for i in 1:100 # reduce variance / impact of launch latency
-            @inbounds @simd for j in 1:N
+            @inbounds @simd for j in 1:get_N(us)
                 y1[j] = bc[j]
             end
         end
@@ -291,14 +334,14 @@ end;
             CUDA.@cuda always_inline = true launch = false custom_kernel_knl_bc!(
                 y1,
                 bc,
-                Val(N),
+                us,
             )
         config = CUDA.launch_configuration(kernel.fun)
-        threads = min(N, config.threads)
-        blocks = cld(N, threads)
+        threads = min(get_N(us), config.threads)
+        blocks = cld(get_N(us), threads)
         printtb && @show blocks, threads
         for i in 1:100 # reduce variance / impact of launch latency
-            kernel(y1, bc, Val(N); threads, blocks)
+            kernel(y1, bc, us; threads, blocks)
         end
     end
     return nothing
@@ -306,13 +349,13 @@ end;
 @inline get_cart_lin_index(bc, n, I) = I
 @inline get_cart_lin_index(bc::Base.Broadcast.Broadcasted, n, I) =
     CartesianIndices(map(x -> Base.OneTo(x), n))[I]
-function custom_kernel_knl_bc!(y1, bc, ::Val{N}) where {N}
+function custom_kernel_knl_bc!(y1, bc, us)
     @inbounds begin
         I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
-        n = size(y1)
-        if 1 ≤ I ≤ N
-            ind = get_cart_lin_index(bc, n, I)
-            y1[ind] = bc[ind]
+        if 1 ≤ I ≤ get_N(us)
+            n = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
+            ci = get_cart_lin_index(bc, n, I)
+            y1[ci] = bc[ci]
         end
     end
    return nothing
@@ -327,16 +370,31 @@ X_vector = to_vec(X_array);
 Y_vector = to_vec(Y_array);
 at_dot_call!(X_array, Y_array)
 at_dot_call!(X_vector, Y_vector)
-# custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1)))
-custom_kernel_bc!(X_vector, Y_vector, Val(length(X_vector.x1)))
-custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=false)
-custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=true)
+N = length(X_vector.x1)
+(Nv, Nij, _, Nf, Nh) = size(Y_array.y1);
+us = UniversalSizesCC(Nv, Nij, Nh);
+uss = UniversalSizesStatic(Nv, Nij, Nh);
+@test get_N(us) == N
+@test get_N(uss) == N
+iscpu = ArrayType === identity
+iscpu || custom_sol_kernel!(X_vector, Y_vector, Val(N))
+custom_kernel_bc!(X_vector, Y_vector, us)
+custom_kernel_bc!(X_array, Y_array, us; use_pw=false)
+custom_kernel_bc!(X_array, Y_array, us; use_pw=true)
+
+custom_kernel_bc!(X_vector, Y_vector, uss)
+custom_kernel_bc!(X_array, Y_array, uss; use_pw=false)
+custom_kernel_bc!(X_array, Y_array, uss; use_pw=true)
 
 @pretty_belapsed at_dot_call!($X_array, $Y_array) # slow
 @pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast
-# @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1))))
-@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1)));printtb=false)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=false)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=true)
+iscpu || @pretty_belapsed custom_sol_kernel!($X_vector, $Y_vector, $(Val(N)))
+@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $us; printtb=false)
+@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; printtb=false, use_pw=false)
+@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; printtb=false, use_pw=true)
+
+@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb=false)
+@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; printtb=false, use_pw=false)
+@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; printtb=false, use_pw=true)
 
 #! format: on
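Note on the pattern above: `UniversalSizesCC` stores the mesh extent behind a field (`Nh::Int`), so `get_N` is a run-time value, while `UniversalSizesStatic` carries all sizes in type parameters, so `get_N` is a compile-time constant the GPU compiler can fold into the kernel, analogous to the static NDRanges of KernelAbstractions referenced in the script's docstring. A minimal standalone sketch of that distinction, with illustrative names that are not part of the patch:

```julia
struct DynamicSize          # size stored in a field: known only at run time
    N::Int
end
struct StaticSize{N} end    # size stored in a type parameter: known at compile time

total_len(s::DynamicSize) = s.N
total_len(::StaticSize{N}) where {N} = N  # constant-folds wherever it is inlined

# Identical behavior at the call site; only the compiler's knowledge differs.
@assert total_len(DynamicSize(8)) == total_len(StaticSize{8}()) == 8
```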
From 519c59c4b4d46615b2de9b0c3c19ce20329a5e9b Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Mon, 22 Jul 2024 10:30:58 -0400
Subject: [PATCH 2/3] Rename benchmark script

---
 ...vs_cartesian_indexing.jl => indexing_and_static_ndranges.jl} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename benchmarks/scripts/{linear_vs_cartesian_indexing.jl => indexing_and_static_ndranges.jl} (99%)

diff --git a/benchmarks/scripts/linear_vs_cartesian_indexing.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl
similarity index 99%
rename from benchmarks/scripts/linear_vs_cartesian_indexing.jl
rename to benchmarks/scripts/indexing_and_static_ndranges.jl
index 5bc9645183..b92a3c6d5e 100644
--- a/benchmarks/scripts/linear_vs_cartesian_indexing.jl
+++ b/benchmarks/scripts/indexing_and_static_ndranges.jl
@@ -1,6 +1,6 @@
 #=
 julia --project=.buildkite
-using Revise; include(joinpath("benchmarks", "scripts", "linear_vs_cartesian_indexing.jl"))
+using Revise; include(joinpath("benchmarks", "scripts", "indexing_and_static_ndranges.jl"))
 
 # Info:
 This script compares two things:

From f940c2cc958eb2880056c477bed6dd4f057278ef Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Mon, 22 Jul 2024 10:36:47 -0400
Subject: [PATCH 3/3] Add summary to perf benchmark

---
 benchmarks/scripts/indexing_and_static_ndranges.jl | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/benchmarks/scripts/indexing_and_static_ndranges.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl
index b92a3c6d5e..577d971e4f 100644
--- a/benchmarks/scripts/indexing_and_static_ndranges.jl
+++ b/benchmarks/scripts/indexing_and_static_ndranges.jl
@@ -18,7 +18,16 @@ the performance of vector vs array operations in a broadcast
 setting where linear indexing is allowed.
 
 # Summary:
- - Pointwise
+ - On the CPU:
+   static NDRanges do not play an important role;
+   linear indexing is 2x faster than cartesian
+   indexing.
+ - On the GPU:
+   static NDRanges DO play an important role,
+   but we could (alternatively) get a comparable
+   improvement by using linear indexing. Supporting
+   static NDRanges also helps non-pointwise kernels,
+   and yields nearly the same benefit as linear indexing.
 
 # References:
 - https://github.com/CliMA/ClimaCore.jl/issues/1889
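Note: the cartesian-vs-linear gap described in the summary comes from converting a linear index into a `CartesianIndex` through `CartesianIndices`, which is what `get_cart_lin_index` does for `Broadcasted` arguments in the script. A minimal CPU-side sketch of the two indexing paths, with illustrative names that are not part of the patch:

```julia
# Linear path: the loop counter is the index; no conversion is needed.
function linear_path!(y, x)
    @inbounds @simd for i in 1:length(y)
        y[i] = x[i] + 1
    end
    return nothing
end

# Cartesian path: each linear index is first mapped to a CartesianIndex,
# paying integer division work per element; this conversion is the kind of
# overhead behind the CPU gap noted in the summary.
function cartesian_path!(y, x)
    ci = CartesianIndices(size(y))
    @inbounds for i in 1:length(y)
        I = ci[i]
        y[I] = x[I] + 1
    end
    return nothing
end
```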