# benchmark.yml
name: Benchmarks
on:
  push:
    branches:
      - master
      - update_benchmark
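# Three benchmark jobs, all on self-hosted runners: a Mac (Metal), an NVIDIA
# Linux box (CUDA), and the tinybox (AMD/HSA). Each job runs a set of example
# workloads and uploads its timing logs as a build artifact.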
jobs:
  testmacbenchmark:
    name: Mac Benchmark
    runs-on: [self-hosted, macOS]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
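    # `-o pipefail` makes a `python3 ... | tee log.txt` step fail when the
    # benchmark fails, not only when tee does; PYTHONPATH=. lets the examples
    # and tests import tinygrad straight from the checkout.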
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Symlink models and datasets
        run: |
          mkdir -p weights
          ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu
          ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
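      # The symlinks point at weights and datasets assumed to be pre-downloaded
      # on the self-hosted runner, so a benchmark run never re-fetches multi-GB
      # model files.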
      # TODO: why is this test not reliable?
      # - name: Run Stable Diffusion
      #   run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      - name: Run model inference benchmark
        run: METAL=1 python3 test/external/external_model_benchmark.py
      - name: Test speed vs torch
        run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Run Tensor Core GEMM
        run: |
          DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
          DEBUG=2 HALF=1 python3 extra/gemm/simple_matmul.py | tee matmul_half.txt
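      # DEBUG=2 makes tinygrad print per-kernel timing, and HALF=1 switches the
      # matmul to float16 (readings of these tinygrad env flags; exact output
      # may vary by revision).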
      - name: Run LLaMA
        run: |
          JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run LLaMA with BEAM
        run: JIT=1 BEAM=2 CACHELEVEL=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
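      # BEAM=2 turns on tinygrad's beam search over kernel optimizations and
      # CACHELEVEL=0 disables the on-disk kernel cache so the search cost shows
      # up in the timing (assumed semantics of these tinygrad env vars).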
      - name: Run GPT2
        run: |
          JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run GPT2 w HALF
        run: JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
      - name: Run GPT2 w HALF/BEAM
        run: JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
      - name: Run 10 CIFAR training steps
        run: STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      # TODO: this is flaky too
      # - name: Run 10 CIFAR training steps w winograd
      #   run: WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (Mac)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            train_cifar.txt
            train_cifar_wino.txt
            llama_unjitted.txt
            llama_jitted.txt
            llama_beam.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            gpt2_half.txt
            gpt2_half_beam.txt
            matmul.txt
            matmul_half.txt
            sd.txt
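      # sd.txt and train_cifar_wino.txt come from the steps commented out
      # above, so they may not exist; upload-artifact@v4 only warns on missing
      # paths by default (if-no-files-found: warn).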
  testnvidiabenchmark:
    name: NVIDIA Benchmark
    runs-on: [self-hosted, Linux, CUDA]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Print nvidia-smi
        run: nvidia-smi
      - name: Run model inference benchmark
        run: CUDA=1 python3 test/external/external_model_benchmark.py
      - name: Test speed vs torch
        run: CUDA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Run Tensor Core GEMM
        run: CUDA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
      - name: Run GPT2
        run: |
          CUDA=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          CUDA=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run GPT2 w HALF
        run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
      - name: Run GPT2 w HALF/BEAM
        run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
      - name: Run full CIFAR training
        run: time CUDA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
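      # TARGET_EVAL_ACC_PCT=93 makes hlb_cifar10.py treat a run that ends below
      # 93% eval accuracy as a failure, so accuracy regressions fail CI
      # (assumed from how the training script uses this flag).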
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (NVIDIA)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            matmul.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            gpt2_half.txt
            gpt2_half_beam.txt
            train_cifar_one_gpu.txt
  testamdbenchmark:
    name: tinybox Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Show off tinybox
        run: /opt/rocm/bin/rocm-bandwidth-test
      - name: Symlink models and datasets
        run: |
          mkdir -p weights
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
      - name: Run model inference benchmark
        run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
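      # HSA=1 selects tinygrad's AMD HSA backend, the LD_PRELOAD pins which
      # libhsa-runtime the process loads, and NOCLANG=1 drops the slow CPU
      # clang backend from the comparison (flag readings are assumptions from
      # tinygrad's env-var conventions).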
      - name: Test speed vs torch
        run: |
          python3 -c "import torch; print(torch.__version__)"
          LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Run Tensor Core GEMM
        run: HSA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
      - name: Run Stable Diffusion
        run: HSA=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      - name: Run LLaMA
        run: |
          HSA=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          HSA=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run GPT2
        run: |
          HSA=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          HSA=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run 10 CIFAR training steps
        run: HSA=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      - name: Run 10 CIFAR training steps w HALF
        run: HSA=1 STEPS=10 HALF=1 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
      # TODO: enable this. it took 3 minutes in CI and made the full training one more than 5 minutes
      # - name: Run 10 CIFAR training steps w 6 GPUS
      #   run: time HALF=1 STEPS=10 BS=1536 GPUS=6 python3 examples/hlb_cifar10.py
      - name: Run full CIFAR training
        run: time HSA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (AMD)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            train_cifar.txt
            train_cifar_half.txt
            train_cifar_wino.txt
            train_cifar_one_gpu.txt
            llama_unjitted.txt
            llama_jitted.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            matmul.txt
            sd.txt