# don't allow MLB assigns with different axes (#3557) #19
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Benchmarks workflow: runs speed benchmarks on three self-hosted runners
# (macOS, Linux/CUDA, Linux/tinybox) and uploads the captured output files
# as one artifact per job.
name: Benchmarks

# Runs on pushes to the branches listed below.
on:
  push:
    branches:
      - master
      - update_benchmark

jobs:
  # Benchmarks on the self-hosted macOS runner.
  testmacbenchmark:
    name: Mac Benchmark
    runs-on: [self-hosted, macOS]
    defaults:
      run:
        # pipefail: a failing command on the left of `| tee` fails the step
        # instead of being masked by tee's exit status.
        shell: bash -o pipefail {0}
    # Skip the job when the repository owner is not 'tinygrad'.
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      # Links files kept under ~/tinygrad on the runner into the checkout.
      - name: Symlink models and datasets
        run: |
          mkdir -p weights
          ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu
          ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
      # TODO: why is this test not reliable?
      # - name: Run Stable Diffusion
      #   run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      - name: Run model inference benchmark
        run: METAL=1 python3 test/external/external_model_benchmark.py
      - name: Test speed vs torch
        run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Run Tensor Core GEMM
        run: |
          DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
          DEBUG=2 HALF=1 python3 extra/gemm/simple_matmul.py | tee matmul_half.txt
      # Each model is run twice, with JIT=0 and JIT=1, into separate files.
      - name: Run LLaMA
        run: |
          JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run GPT2
        run: |
          JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run 10 CIFAR training steps
        run: STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      # TODO: this is flaky too
      # - name: Run 10 CIFAR training steps w winograd
      #   run: WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
      # NOTE(review): train_cifar_wino.txt and sd.txt are listed below, but the
      # steps that write them are commented out above, so no active step in
      # this job produces them. onnx_inference_speed.csv is presumably written
      # by external_model_benchmark.py - confirm.
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (Mac)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            train_cifar.txt
            train_cifar_wino.txt
            llama_unjitted.txt
            llama_jitted.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            matmul.txt
            matmul_half.txt
            sd.txt

  # Benchmarks on the self-hosted Linux runner labelled CUDA.
  testnvidiabenchmark:
    name: NVIDIA Benchmark
    runs-on: [self-hosted, Linux, CUDA]
    defaults:
      run:
        # pipefail: a failing command on the left of `| tee` fails the step.
        shell: bash -o pipefail {0}
    # Skip the job when the repository owner is not 'tinygrad'.
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Print nvidia-smi
        run: nvidia-smi
      - name: Run model inference benchmark
        run: CUDA=1 python3 test/external/external_model_benchmark.py
      - name: Test speed vs torch
        run: CUDA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Run GPT2
        run: |
          CUDA=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          CUDA=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      # Output of this step is not captured to a file, so it is not uploaded.
      - name: Run GPT2 w HALF
        run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing
      - name: Run GPT2 w HALF/BEAM
        run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (NVIDIA)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            gpt2_half_beam.txt

  # Benchmarks on the self-hosted Linux runner labelled tinybox.
  testamdbenchmark:
    name: tinybox Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    defaults:
      run:
        # pipefail: a failing command on the left of `| tee` fails the step.
        shell: bash -o pipefail {0}
    # Skip the job when the repository owner is not 'tinygrad'.
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Show off tinybox
        run: /opt/rocm/bin/rocm-bandwidth-test
      # Links files kept under ~/tinygrad on the runner into the checkout.
      - name: Symlink models and datasets
        run: |
          mkdir -p weights
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
      - name: Run model inference benchmark
        run: HIP=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
      # NOTE(review): unlike the neighbouring steps, this one and
      # "Run Stable Diffusion" do not set HIP=1 - confirm that is intended.
      - name: Test speed vs torch
        run: BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Run Tensor Core GEMM
        run: HIP=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
      - name: Run Stable Diffusion
        run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      # Each model is run twice, with JIT=0 and JIT=1, into separate files.
      - name: Run LLaMA (with HIP)
        run: |
          HIP=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          HIP=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run GPT2 (with HIP)
        run: |
          HIP=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          HIP=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run 10 CIFAR training steps
        run: HIP=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      - name: Run 10 CIFAR training steps w HALF
        run: HIP=1 STEPS=10 HALF=1 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
      # TODO: enable this. it took 3 minutes in CI and made the full training one more than 5 minutes
      # - name: Run 10 CIFAR training steps w 6 GPUS
      #   run: time HALF=1 STEPS=10 BS=1536 GPUS=6 python3 examples/hlb_cifar10.py
      - name: Run full CIFAR training
        run: time HIP=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
      # NOTE(review): train_cifar_wino.txt is listed below, but no step in this
      # job writes it - confirm whether the entry is still needed.
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (AMD)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            train_cifar.txt
            train_cifar_half.txt
            train_cifar_wino.txt
            train_cifar_one_gpu.txt
            llama_unjitted.txt
            llama_jitted.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            matmul.txt
            sd.txt