Skip to content

Commit

Permalink
Merge pull request #1965 from CliMA/ck/update_copyto_benchmarks
Browse files Browse the repository at this point in the history
Refactor and update benchmarks
  • Loading branch information
charleskawczynski authored Aug 30, 2024
2 parents 9dd63b9 + 256319a commit aed16b0
Show file tree
Hide file tree
Showing 7 changed files with 221 additions and 131 deletions.
48 changes: 25 additions & 23 deletions benchmarks/scripts/benchmark_offset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "benchmark_offset.jl"))
Clima A100:
```
[ Info: ArrayType = CuArray
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 834 nanoseconds │ 57.7908 │ 1178.35 │ 4 │ 100 │
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 153 nanoseconds │ 68.4046 │ 1394.77 │ 4 │ 100 │
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 576 nanoseconds │ 70.3113 │ 1433.65 │ 4 │ 100 │
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 185 nanoseconds │ 59.2089 │ 1207.27 │ 4 │ 100 │
└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float32, Device_bandwidth_GBs=2039
┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
│ funcs │ time per call │ bw % │ achieved bw │
├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 84 microseconds, 726 nanoseconds │ 46.9507 │ 957.324 │
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 102 nanoseconds │ 68.4649 │ 1396.0 │
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 331 nanoseconds │ 70.618 │ 1439.9 │
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 390 nanoseconds │ 59.029 │ 1203.6 │
└────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
[ Info: ArrayType = CuArray
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 967 nanoseconds │ 57.6793 │ 1176.08 │ 4 │ 100 │
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 82 nanoseconds │ 68.489 │ 1396.49 │ 4 │ 100 │
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 597 nanoseconds │ 70.2858 │ 1433.13 │ 4 │ 100 │
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 288 nanoseconds │ 59.1188 │ 1205.43 │ 4 │ 100 │
└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float64, Device_bandwidth_GBs=2039
┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
│ funcs │ time per call │ bw % │ achieved bw │
├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 107 microseconds, 387 nanoseconds │ 74.086 │ 1510.61 │
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 105 microseconds, 42 nanoseconds │ 75.7399 │ 1544.34 │
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 102 microseconds, 636 nanoseconds │ 77.5157 │ 1580.54 │
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 106 microseconds, 896 nanoseconds │ 74.4266 │ 1517.56 │
└────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
```
=#

#! format: off
module BenchmarkOffset

import CUDA
include("benchmark_utils.jl")

"""
    add3(x1, x2, x3)

Return the sum `x1 + x2 + x3` of the three arguments.
"""
function add3(x1, x2, x3)
    return x1 + x2 + x3
end
Expand Down Expand Up @@ -76,7 +77,7 @@ function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
e = min(e, et)
end
end
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
return nothing
end;
function aos_cart_offset_kernel!(X, Y, us)
Expand Down Expand Up @@ -131,7 +132,7 @@ function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
e = min(e, et)
end
end
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
return nothing
end;
function aos_lin_offset_kernel!(X, Y, us)
Expand Down Expand Up @@ -184,7 +185,7 @@ function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
e = min(e, et)
end
end
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
return nothing
end;
function soa_cart_index_kernel!(X, Y, us)
Expand Down Expand Up @@ -229,7 +230,7 @@ function soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
e = min(e, et)
end
end
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
return nothing
end;
function soa_linear_index_kernel!(X, Y, us)
Expand Down Expand Up @@ -258,9 +259,10 @@ end
using CUDA
using Test
@testset "Offset benchmark" begin
bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
ArrayType = CUDA.CuArray;
# ArrayType = Base.identity;
device_name = CUDA.name(CUDA.device())
bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
arr(float_type, problem_size, T) = T(zeros(float_type, problem_size...))

FT = Float64;
Expand Down
122 changes: 97 additions & 25 deletions benchmarks/scripts/benchmark_utils.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import CUDA
# import CUDA
import ClimaComms
using BenchmarkTools, Dates
using LazyBroadcast: @lazy

Expand All @@ -14,21 +15,40 @@ macro caller_name(f)
end
end

"""
    device_info(device_name::String)

Look up the hardware specs recorded for the device called `device_name`.

Returns a named tuple `(; device_bandwidth_GBs, exists, name)`:

 - when `ClimaComms.device()` is a `ClimaComms.CUDADevice` and `device_name`
   is one of the names in the table below, `device_bandwidth_GBs` is the
   tabulated value and `exists` is `true`;
 - otherwise `device_bandwidth_GBs` is the placeholder `1` and `exists` is
   `false`.

`name` is always `device_name`, passed through unchanged.

Call with `device_info(CUDA.name(CUDA.device()))`
"""
function device_info(device_name)
    # Bandwidth per known device name; extend this table to support
    # additional devices.
    known_bandwidths_GBs = Dict(
        "NVIDIA A100-SXM4-80GB" => 2_039,
        "Tesla P100-PCIE-16GB" => 732,
    )
    on_cuda = ClimaComms.device() isa ClimaComms.CUDADevice
    if !(on_cuda && haskey(known_bandwidths_GBs, device_name))
        # Not running on CUDA, or a device we have no specs for.
        return (; device_bandwidth_GBs = 1, exists = false, name = device_name)
    end
    device_bandwidth_GBs = known_bandwidths_GBs[device_name]
    return (; device_bandwidth_GBs, exists = true, name = device_name)
end

Base.@kwdef mutable struct Benchmark
problem_size::Tuple
problem_size = nothing
float_type::Type
device_bandwidth_GBs::Int = 2_039 # (A100 SXM4 80GB)
data::Vector = []
unfound_device::Bool = false
unfound_device_name::String = ""
device_name::String = ""
end

"""
    perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes)

Derive bandwidth statistics for one kernel timing.

Returns a named tuple `(; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency)`:

 - `N`: `prod(bm.problem_size)`
 - `GB`: `N * n_reads_writes * sizeof(bm.float_type)` divided by `1024^3`
 - `achieved_bandwidth_GBs`: `GB / kernel_time_s`
 - `bandwidth_efficiency`: `achieved_bandwidth_GBs` as a percentage of
   `bm.device_bandwidth_GBs`
"""
function perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes)
    N = prod(bm.problem_size)
    bytes_per_element = sizeof(bm.float_type)
    # NOTE(review): the divisor is 1024^3 (GiB); confirm that this matches the
    # unit convention used for `bm.device_bandwidth_GBs`.
    GB = N * n_reads_writes * bytes_per_element / 1024^3
    achieved_bandwidth_GBs = GB / kernel_time_s
    fraction_of_peak = achieved_bandwidth_GBs / bm.device_bandwidth_GBs
    bandwidth_efficiency = fraction_of_peak * 100
    return (; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency)
end;
"""
    print_unfound_devices(bm::Benchmark)

If `bm.unfound_device` is set, print a notice naming the device
(`bm.unfound_device_name`) and asking the user to add its specs to
`device_info()` in this file. Does nothing otherwise.

Always returns `nothing`.
"""
function print_unfound_devices(bm::Benchmark)
    bm.unfound_device || return nothing
    # The specs table lives in `device_info`, so point the reader there.
    println("\nUnfound device: $(bm.unfound_device_name). Please")
    println("look up specs and add to device_info() in")
    println("$(@__FILE__).\n")
    return nothing
end

time_and_units_str(x::Real) =
trunc_time(string(compound_period(x, Dates.Second)))
Expand All @@ -51,46 +71,98 @@ get_Nh(us::UniversalSizesCC) = us.Nh
# For the static variant, `Nh` is read directly from the type parameter.
function get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh}
    return Nh
end

# Product of the entries of the 5-tuple `(Nv, Nij, Nij, 1, Nh)`.
function get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    dims = (Nv, Nij, Nij, 1, get_Nh(us))
    return prod(dims)
end

# `size(us)` is the 5-tuple `(Nv, Nij, Nij, 1, Nh)`.
function Base.size(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    return (Nv, Nij, Nij, 1, get_Nh(us))
end

# Convenience constructors taking the sizes as plain values: `Nv` and `Nij`
# become type parameters; `Nh` is passed as a constructor argument
# (`UniversalSizesCC`) or becomes a type parameter (`UniversalSizesStatic`).
function UniversalSizesCC(Nv, Nij, Nh)
    return UniversalSizesCC{Nv, Nij}(Nh)
end

function UniversalSizesStatic(Nv, Nij, Nh)
    return UniversalSizesStatic{Nv, Nij, Nh}()
end

import PrettyTables
function tabulate_benchmark(bm)
funcs = map(x -> x.caller, bm.data)
funcs = map(x -> strip(x.caller), bm.data)
timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data)
n_reads_writes = map(x -> x.n_reads_writes, bm.data)
nreps = map(x -> x.nreps, bm.data)
dinfo = device_info(bm.device_name)
achieved_bandwidth_GBs = map(x -> x.achieved_bandwidth_GBs, bm.data)
bandwidth_efficiency = map(x -> x.bandwidth_efficiency, bm.data)
bandwidth_efficiency = if dinfo.exists
map(x -> x / dinfo.device_bandwidth_GBs * 100, achieved_bandwidth_GBs)
else
()
end
problem_size = map(x -> x.problem_size, bm.data)
# if we specify the problem size up front, then make
# sure that there is no variation when collecting:
if !isnothing(bm.problem_size)
@assert all(prod.(problem_size) .== prod(bm.problem_size))
end
N = map(x -> prod(x), problem_size)
no_bw_efficiency = length(bandwidth_efficiency) == 0
header = [
"funcs",
"time per call",
"bw %",
(no_bw_efficiency ? () : ("bw %",))...,
"achieved bw",
"n-reads/writes",
"n-reps",
(allequal(n_reads_writes) ? () : ("N reads-writes",))...,
(allequal(N) ? () : ("problem size",))...,
(allequal(nreps) ? () : ("n-reps",))...,
]
data = hcat(
args = (
funcs,
timings,
bandwidth_efficiency,
(no_bw_efficiency ? () : (bandwidth_efficiency,))...,
achieved_bandwidth_GBs,
n_reads_writes,
nreps,
(allequal(n_reads_writes) ? () : (n_reads_writes,))...,
(allequal(N) ? () : (problem_size,))...,
(allequal(nreps) ? () : (nreps,))...,
)
data = hcat(args...)
n_reads_writes_str =
allequal(n_reads_writes) ? "N reads-writes: $(n_reads_writes[1]), " : ""
problem_size_str = allequal(N) ? "Problem size: $(problem_size[1]), " : ""
nreps_str = allequal(nreps) ? "N-reps: $(nreps[1]), " : ""
device_bandwidth_GBs_str =
dinfo.exists ? "Device_bandwidth_GBs=$(dinfo.device_bandwidth_GBs)" : ""
print_unfound_devices(bm)
title = strip(
"$problem_size_str$n_reads_writes_str$nreps_str Float_type = $(bm.float_type), $device_bandwidth_GBs_str",
)
title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)"
PrettyTables.pretty_table(data; title, header, alignment = :l, crop = :none)
end

push_info(bm::Nothing; e, nreps, caller, n_reads_writes) = nothing
function push_info(bm; e, nreps, caller, n_reads_writes)
kernel_time_s = e / nreps
push_info(
bm::Nothing;
kernel_time_s,
nreps,
caller,
n_reads_writes,
problem_size,
) = nothing
function push_info(
bm;
kernel_time_s,
nreps,
caller,
n_reads_writes,
problem_size,
)
N = prod(problem_size)
GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3
achieved_bandwidth_GBs = GB / kernel_time_s
dinfo = device_info(bm.device_name)
if !dinfo.exists
bm.unfound_device = true
bm.unfound_device_name = dinfo.name
end

nt = (;
caller,
kernel_time_s,
n_reads_writes,
nreps,
perf_stats(; bm, kernel_time_s, n_reads_writes)...,
problem_size,
N,
GB,
achieved_bandwidth_GBs,
)
push!(bm.data, nt)
end
Loading

0 comments on commit aed16b0

Please sign in to comment.