From 02683a8659b6aba5f328129009e71422852cf60a Mon Sep 17 00:00:00 2001
From: chenyu <chenyu@fastmail.com>
Date: Tue, 20 Feb 2024 09:36:22 -0500
Subject: [PATCH] gate the cast before movements in lazy (#3452)

it made gpt2 slower (2ms -> 2.5ms on 3090, 7ms -> 8ms on M1 Max with BEAM=2).
disabled it in gpt2 benchmark before understanding the full issue
---
 .github/workflows/benchmark.yml | 2 +-
 tinygrad/lazy.py                | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 44d74f7122e4..bc8397037e25 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -90,7 +90,7 @@ jobs:
     - name: Run GPT2 w HALF
       run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing
     - name: Run GPT2 w HALF/BEAM
-      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - uses: actions/upload-artifact@v4
       with:
         name: Speed (NVIDIA)
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index bf0a3f5612f1..0c905da755f5 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -65,7 +65,9 @@ def contiguous(self):
 
   def cast(self, dtype:DType, bitcast:bool=False):
     if self.dtype == dtype: return self
-    if dtype.itemsize <= self.dtype.itemsize and self != self.base: return self.base.cast(dtype, bitcast)._view(self.st)
+    # TODO: applying this makes gpt2 slower
+    if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
+      return self.base.cast(dtype, bitcast)._view(self.st)
     return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), dtype, UnaryOps.CAST, (dtype, bitcast), (self,))
 
   def is_unrealized_const(self): return not self.base.realized and self.base.op is LoadOps.CONST