From 42dda34fa1b69376ba6aa8a156b1d9df0defeb73 Mon Sep 17 00:00:00 2001
From: Austin Davis <austinleedavis@users.noreply.github.com>
Date: Wed, 23 Oct 2024 12:07:55 -0400
Subject: [PATCH] Verify vocab is padded before reshaping

This change verifies that the vocab is actually padded (i.e., V != Vp) before reshaping the tensor and slicing off the padding rows.

Otherwise, when V == Vp, the slice bound `V-Vp` is 0, so `w[key] = w[key].reshape(shape)[:(V-Vp), :]` selects no rows and produces an empty `wte` tensor with shape [0, C].
---
 dev/eval/export_hf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/eval/export_hf.py b/dev/eval/export_hf.py
index b52cc28ea..b308da902 100644
--- a/dev/eval/export_hf.py
+++ b/dev/eval/export_hf.py
@@ -88,7 +88,7 @@ def convert(filepath, output, push_to_hub=False, out_dtype="bfloat16"):
         data = np.frombuffer(f.read(num_elements * np.dtype(dtype).itemsize), dtype=dtype)
         w[key] = data.reshape(shape)
         # The binary file saves the padded vocab - drop the padding back to GPT2 size
-        if shape[0] == Vp:
+        if shape[0] == Vp and V != Vp:
             w[key] = w[key].reshape(shape)[:(V-Vp), :]
     # Ensure the file is fully read and then close
     assert f.read() == b''