# ------------------------- Theoretical -------------------------
"""
    theoretical_peakflops_gpu(; device, tensorcores, dtype, verbose, io)

Estimates the theoretical peak performance of a CUDA device in TFLOP/s.

**Keyword arguments:**
* `device` (default: `CUDA.device()`): CUDA device to be analyzed.
* `tensorcores` (default: `hastensorcores()`): toggle usage of tensor cores. If `false`, CUDA cores will be used.
* `dtype` (default: `tensorcores ? Float16 : Float32`): element type of the matrices.
* `verbose` (default: `true`): toggle printing of information.
* `io` (default: `stdout`): set the stream where the results should be printed.
"""
function theoretical_peakflops_gpu(;
    device=CUDA.device(),
    tensorcores=hastensorcores(),
    dtype=tensorcores ? Float16 : Float32,
    verbose=true,
    io::IO=stdout,
)
    if tensorcores
        max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype)
    else
        max_peakflops = _theoretical_peakflops_gpu_cudacores(; device, dtype)
    end
    if verbose
        printstyled(
            io,
            "Theoretical Peakflops ($(Symbol(dtype) == :Int8 ? "TOP" : "TFLOP")/s):\n";
            bold=true,
        )
        if hastensorcores()
            print(io, " ├ tensorcores: ")
            printstyled(io, tensorcores, "\n"; color=:magenta, bold=true)
        end
        print(io, " ├ dtype: ")
        printstyled(io, Symbol(dtype), "\n"; color=:yellow, bold=true)
        print(io, " └ max: ")
        printstyled(io, round(max_peakflops; digits=1), "\n"; color=:green, bold=true)
    end
    return max_peakflops
end
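# Usage sketch (hypothetical; results depend on the device and the clock rate
# reported by the driver):
#
#   theoretical_peakflops_gpu(; tensorcores=false, dtype=Float32) # CUDA cores
#   theoretical_peakflops_gpu(; tensorcores=true, dtype=Float16)  # tensor cores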
function _theoretical_peakflops_gpu_cudacores(; device, dtype)
    max_clock_rate = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) # in kHz
    num_cuda_cores = ncudacores(device)
    max_peakflops = max_clock_rate * num_cuda_cores * 1e-9 # in TFLOP/s
    if dtype == Float32
        # an FMA counts as two FLOPs per CUDA core per cycle
        max_peakflops *= 2
    elseif dtype == Float64
        # assumes an FP64 rate of half the FP32 FMA rate (as on e.g. V100/A100)
        max_peakflops *= 1
    else
        throw(ArgumentError("Unsupported dtype."))
    end
    return max_peakflops
end
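# Worked example of the formula above (nominal A100 numbers, for illustration):
# 6912 CUDA cores at a 1410 MHz (1_410_000 kHz) boost clock give
#   1_410_000 * 6912 * 1e-9 * 2 ≈ 19.5 TFLOP/s (Float32),
# in line with NVIDIA's advertised FP32 peak for that card.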
function _theoretical_peakflops_gpu_tensorcores(;
    device=CUDA.device(), dtype=Float16, verbose=true
)
    cap = CUDA.capability(device)
    if cap == v"8.0.0"
        devtype = :A100
    elseif cap == v"7.0.0"
        devtype = :V100
    else
        error("Unsupported compute capability / device generation.")
    end
    max_clock_rate = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) # in kHz
    num_tensor_cores = ntensorcores(device)
    max_peakflops = max_clock_rate * num_tensor_cores * 1e-9 # in TFLOP/s
    if devtype == :A100
        if Symbol(dtype) == :Float16
            # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C
            # see e.g. https://peerj.com/articles/cs-330.pdf
            max_peakflops *= 2 * 8 * 8 * 4
        elseif Symbol(dtype) in (:Float32, :TensorFloat32, :TF32)
            max_peakflops *= 2 * 4 * 8 * 4
        elseif Symbol(dtype) == :Float64
            max_peakflops *= 2 * 4 * 2 * 2
        elseif Symbol(dtype) == :Int8
            max_peakflops *= 2 * 2 * 8 * 8 * 4
        else
            throw(ArgumentError("Unsupported dtype."))
        end
    elseif devtype == :V100
        if Symbol(dtype) == :Float16
            max_peakflops *= 2 * 4 * 4 * 4
        else
            throw(ArgumentError("Unsupported dtype."))
        end
    end
    return max_peakflops
end
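# Worked example of the A100 Float16 path above (nominal numbers, for illustration):
# 432 tensor cores at a 1410 MHz (1_410_000 kHz) boost clock give
#   1_410_000 * 432 * 1e-9 * (2 * 8 * 8 * 4) ≈ 311.9 TFLOP/s,
# which lines up with NVIDIA's advertised ~312 TFLOP/s FP16 tensor-core peak.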
# ------------------------- Empirical -------------------------
"""
peakflops_gpu(; tensorcores=hastensorcores(), kwargs...)
Tries to estimate the peak performance of a GPU in TFLOP/s by measuring the time
it takes to perform
* `_kernel_fma_nfmas() * size` many FMAs on CUDA cores (if `tensorcores == false`)
* `_kernel_wmma_nwmmas()` many WMMAs on Tensor Cores (if `tensorcores == true`)
For more keyword argument options see [`peakflops_gpu_fmas`](@ref) and [`peakflops_gpu_wmmas`](@ref).
"""
function peakflops_gpu(;
    tensorcores=hastensorcores(),
    verbose=true,
    dtype=tensorcores ? Float16 : Float32,
    io::IO=stdout,
    kwargs...,
)
    if tensorcores
        flops = peakflops_gpu_wmmas(; verbose=false, dtype, kwargs...)
    else
        flops = peakflops_gpu_fmas(; verbose=false, dtype, kwargs...)
    end
    if verbose
        printstyled(
            io, "Peakflops ($(Symbol(dtype) == :Int8 ? "TOP" : "TFLOP")/s):\n"; bold=true
        )
        if hastensorcores()
            print(io, " ├ tensorcores: ")
            printstyled(io, tensorcores, "\n"; color=:magenta, bold=true)
        end
        print(io, " ├ dtype: ")
        printstyled(io, Symbol(dtype), "\n"; color=:yellow, bold=true)
        print(io, " └ max: ")
        printstyled(io, round(flops; digits=1), "\n"; color=:green, bold=true)
    end
    return flops
end
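# Usage sketch (hypothetical; the measured value depends on the device, its clocks,
# and any benchmark options forwarded via `kwargs...`):
#
#   peakflops_gpu(; tensorcores=false)               # FMA benchmark on CUDA cores
#   peakflops_gpu(; tensorcores=true, dtype=Float16) # WMMA benchmark on tensor cores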