Skip to content

Commit

Permalink
Fix linear_vs_cartesian benchmark script
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Jul 19, 2024
1 parent e7b2c9b commit 6b31f5d
Showing 1 changed file with 20 additions and 13 deletions.
33 changes: 20 additions & 13 deletions benchmarks/scripts/linear_vs_cartesian_indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,10 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
return nothing
end;

function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
function custom_kernel_bc!(X, Y, ::Val{Nv}, ::Val{Nij}, ::Val{Nh}, ::Val{N}; printtb=true, use_pw=true) where {Nv, Nij, Nh, N}
(; x1, x2, x3) = X
(; y1) = Y
ns = (Val(Nv),Val(Nij),Val(Nh), Val(N))
bc_base = @lazy @. y1 = myadd(x1, x2, x3)
bc = use_pw ? to_pointwise_bc(bc_base) : bc_base
if y1 isa Array
Expand All @@ -291,26 +292,28 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
CUDA.@cuda always_inline = true launch = false custom_kernel_knl_bc!(
y1,
bc,
Val(N),
ns...,
)
config = CUDA.launch_configuration(kernel.fun)
threads = min(N, config.threads)
blocks = cld(N, threads)
printtb && @show blocks, threads
for i in 1:100 # reduce variance / impact of launch latency
kernel(y1, bc, Val(N); threads, blocks)
kernel(y1, bc,ns...; threads, blocks)
end
end
return nothing
end;
# Fallback method: for any `bc` that is not a `Broadcasted`, the linear index `I`
# is handed back unchanged (`n` is ignored).
@inline function get_cart_lin_index(bc, n, I)
    return I
end
# `Broadcasted` method: translate the linear index `I` into the Cartesian index at
# that position of a grid whose per-dimension extents are the entries of `n`.
@inline function get_cart_lin_index(bc::Base.Broadcast.Broadcasted, n, I)
    extents = map(Base.OneTo, n)
    return CartesianIndices(extents)[I]
end
function custom_kernel_knl_bc!(y1, bc, ::Val{N}) where {N}
function custom_kernel_knl_bc!(y1, bc, ::Val{Nv}, ::Val{Nij}, ::Val{Nh}, ::Val{N}) where {Nv, Nij, Nh, N}
@inbounds begin
I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
n = size(y1)
if 1 ≤ I ≤ N
_Nh = size(y1, 5)
CUDA.@cuprintln("Nh = $_Nh")
n = (Nv, Nij, Nij, 1, _Nh)
ind = get_cart_lin_index(bc, n, I)
y1[ind] = bc[ind]
end
Expand All @@ -327,16 +330,20 @@ X_vector = to_vec(X_array);
Y_vector = to_vec(Y_array);
at_dot_call!(X_array, Y_array)
at_dot_call!(X_vector, Y_vector)
# custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1)))
custom_kernel_bc!(X_vector, Y_vector, Val(length(X_vector.x1)))
custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=false)
custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=true)
N = length(X_vector.x1)
(Nv, Nij, _, Nf, Nh) = size(Y_array.y1);
ns = (Val(Nv), Val(Nij), Val(Nh), Val(N));
@show ns
custom_kernel!(X_vector, Y_vector, Val(N))
custom_kernel_bc!(X_vector, Y_vector, ns...)
custom_kernel_bc!(X_array, Y_array, ns...; use_pw=false)
custom_kernel_bc!(X_array, Y_array, ns...; use_pw=true)

@pretty_belapsed at_dot_call!($X_array, $Y_array) # slow
@pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast
# @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1))))
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1)));printtb=false)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=false)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=true)
@pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(N)))
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(ns...);printtb=false)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(ns...);printtb=false, use_pw=false)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(ns...);printtb=false, use_pw=true)

#! format: on

0 comments on commit 6b31f5d

Please sign in to comment.