From 02683a8659b6aba5f328129009e71422852cf60a Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 20 Feb 2024 09:36:22 -0500
Subject: [PATCH] gate the cast before movements in lazy (#3452)

it made gpt2 slower (2ms -> 2.5ms on 3090, 7ms -> 8ms on M1 Max with BEAM=2).
disabled it in gpt2 benchmark before understanding the full issue
---
 .github/workflows/benchmark.yml | 2 +-
 tinygrad/lazy.py                | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 44d74f7122e4..bc8397037e25 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -90,7 +90,7 @@ jobs:
     - name: Run GPT2 w HALF
       run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing
     - name: Run GPT2 w HALF/BEAM
-      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA)
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index bf0a3f5612f1..0c905da755f5 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -65,7 +65,9 @@ def contiguous(self):
 
   def cast(self, dtype:DType, bitcast:bool=False):
     if self.dtype == dtype: return self
-    if dtype.itemsize <= self.dtype.itemsize and self != self.base: return self.base.cast(dtype, bitcast)._view(self.st)
+    # TODO: applying this makes gpt2 slower
+    if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
+      return self.base.cast(dtype, bitcast)._view(self.st)
     return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), dtype, UnaryOps.CAST, (dtype, bitcast), (self,))
 
   def is_unrealized_const(self): return not self.base.realized and self.base.op is LoadOps.CONST