From 6b31f5d1200cb0380b4680f74295d4a7d6430a82 Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Fri, 19 Jul 2024 16:59:08 -0400 Subject: [PATCH] Fix linear_vs_cartesian benchmark script --- .../scripts/linear_vs_cartesian_indexing.jl | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/benchmarks/scripts/linear_vs_cartesian_indexing.jl b/benchmarks/scripts/linear_vs_cartesian_indexing.jl index 9622353882..22ca318c4b 100644 --- a/benchmarks/scripts/linear_vs_cartesian_indexing.jl +++ b/benchmarks/scripts/linear_vs_cartesian_indexing.jl @@ -267,9 +267,10 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N} return nothing end; -function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N} +function custom_kernel_bc!(X, Y, ::Val{Nv}, ::Val{Nij}, ::Val{Nh}, ::Val{N}; printtb=true, use_pw=true) where {Nv, Nij, Nh, N} (; x1, x2, x3) = X (; y1) = Y + ns = (Val(Nv), Val(Nij), Val(Nh), Val(N)) bc_base = @lazy @. y1 = myadd(x1, x2, x3) bc = use_pw ? 
to_pointwise_bc(bc_base) : bc_base if y1 isa Array @@ -291,14 +292,14 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N} CUDA.@cuda always_inline = true launch = false custom_kernel_knl_bc!( y1, bc, - Val(N), + ns..., ) config = CUDA.launch_configuration(kernel.fun) threads = min(N, config.threads) blocks = cld(N, threads) printtb && @show blocks, threads for i in 1:100 # reduce variance / impact of launch latency - kernel(y1, bc, Val(N); threads, blocks) + kernel(y1, bc, ns...; threads, blocks) end end return nothing @@ -306,11 +307,11 @@ end; @inline get_cart_lin_index(bc, n, I) = I @inline get_cart_lin_index(bc::Base.Broadcast.Broadcasted, n, I) = CartesianIndices(map(x -> Base.OneTo(x), n))[I] -function custom_kernel_knl_bc!(y1, bc, ::Val{N}) where {N} +function custom_kernel_knl_bc!(y1, bc, ::Val{Nv}, ::Val{Nij}, ::Val{Nh}, ::Val{N}) where {Nv, Nij, Nh, N} @inbounds begin I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x - n = size(y1) if 1 ≤ I ≤ N + n = (Nv, Nij, Nij, 1, Nh) ind = get_cart_lin_index(bc, n, I) y1[ind] = bc[ind] end @@ -327,16 +328,20 @@ X_vector = to_vec(X_array); Y_vector = to_vec(Y_array); at_dot_call!(X_array, Y_array) at_dot_call!(X_vector, Y_vector) -# custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1))) -custom_kernel_bc!(X_vector, Y_vector, Val(length(X_vector.x1))) -custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=false) -custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=true) +N = length(X_vector.x1) +(Nv, Nij, _, Nf, Nh) = size(Y_array.y1); +ns = (Val(Nv), Val(Nij), Val(Nh), Val(N)); +@show ns +custom_kernel!(X_vector, Y_vector, Val(N)) +custom_kernel_bc!(X_vector, Y_vector, ns...) 
+custom_kernel_bc!(X_array, Y_array, ns...; use_pw=false) +custom_kernel_bc!(X_array, Y_array, ns...; use_pw=true) @pretty_belapsed at_dot_call!($X_array, $Y_array) # slow @pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast -# @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1)))) -@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1)));printtb=false) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=false) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=true) +@pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(N))) +@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(ns...);printtb=false) +@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(ns...);printtb=false, use_pw=false) +@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(ns...);printtb=false, use_pw=true) #! format: on