Commit

code for bells and whistles on nepali dataset (in project gutenberg code format)
Aananda-giri committed Nov 7, 2024
1 parent 7db1d1b commit 5c349cd
Showing 2 changed files with 163 additions and 40 deletions.
199 changes: 160 additions & 39 deletions 3. GPT-2/sebastian_gutenberg/modifications.ipynb
@@ -32,6 +32,17 @@
"!pip install datasets --quiet"
]
},
{
"cell_type": "code",
"source": [
"print(\"hi\")"
],
"metadata": {
"id": "2WdVM-BTn6Y6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
@@ -253,17 +264,7 @@
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50000\n",
" ...\n",
"```\n",
"\n",
"**3. limit text size to 45Million:**\n",
"# otherwise it is giving cuda out of memory error.\n",
"text_data = text_data[:45000000]\n",
"\n",
"**3. Modify start context:**\n",
"start_context = \"रामले भात\", # <modified>\n",
"\n",
"# instead of\n",
"start_context = \"Every effort moves you\", # <original>"
"```"
]
},
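For reference, the modifications listed in the markdown cell above can be collected into one runnable sketch. Only `vocab_size`, the 45-million-character cap, and the Nepali start context come from this notebook; the remaining config fields and the input file path are assumed defaults from the book's chapter 5 code, not part of this commit.

```python
# Sketch of the notebook's modifications; fields other than vocab_size and the
# input path are assumptions (book defaults), not part of this commit.
from pathlib import Path

GPT_CONFIG_124M = {
    "vocab_size": 50000,      # modified for the Nepali tokenizer (GPT-2 default: 50257)
    "context_length": 1024,   # assumed default
    "emb_dim": 768,           # assumed default
    "n_heads": 12,            # assumed default
    "n_layers": 12,           # assumed default
    "drop_rate": 0.1,         # assumed default
    "qkv_bias": False,        # assumed default
}

# Limit the corpus to 45 million characters; otherwise training hits a
# CUDA out-of-memory error. The file path is hypothetical, for illustration.
text_data = Path("nepberta_sample/combined_1.txt").read_text(encoding="utf-8")
text_data = text_data[:45_000_000]

# Nepali start context (roughly "Ram ... rice") for generation samples,
# instead of the book's "Every effort moves you".
start_context = "रामले भात"
```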
{
Expand All @@ -277,32 +278,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!pip install datasets --quiet"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mOKoDTpAw4T5",
"outputId": "f11092a8-fbcf-472c-9c0d-67d85407c3e1"
"outputId": "5587b84b-b2b6-46ad-92b1-3f4e0e972dd6"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m471.0/480.6 kB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m471.0/480.6 kB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/116.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/179.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/134.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/194.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
],
"source": [
"!pip install datasets --quiet"
]
},
{
Expand All @@ -317,8 +318,8 @@
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"[Errno 2] No such file or directory: '/content/drive/MyDrive/Research/llm.np/sebastian_gutenberg/'\n",
"/content\n"
Expand All @@ -334,14 +335,90 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eVrF8P9stXG1"
"id": "eVrF8P9stXG1",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "18058b3d-c56b-43a7-a901-51b3ab6f83b7"
},
"outputs": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\rREADME.md: 0% 0.00/2.39k [00:00<?, ?B/s]\rREADME.md: 100% 2.39k/2.39k [00:00<00:00, 13.5MB/s]\n",
"Repo card metadata block was not found. Setting CardData to empty.\n",
"Saved chunk 1 to chunk_1.txt\n"
]
}
],
"source": [
"# Download dataset\n",
"!python3 prepare_dataset.py"
]
},
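prepare_dataset.py itself is not part of this diff; judging only from the logged output above (a Hub download followed by "Saved chunk 1 to chunk_1.txt"), a minimal sketch of what it might do is below. The dataset repo id and the "text" column are assumptions.

```python
# Hypothetical sketch of prepare_dataset.py, reconstructed from the log above:
# stream a Nepali corpus from the Hugging Face Hub and save it as text chunks.
# The repo id and the "text" column name are assumptions.
from datasets import load_dataset

CHUNK_CHARS = 45_000_000  # aligns with the 45M-character cap used in training


def flush(buffer, chunk_no):
    # Write the accumulated texts to chunk_<n>.txt and report it.
    fname = f"chunk_{chunk_no}.txt"
    with open(fname, "w", encoding="utf-8") as f:
        f.write(" ".join(buffer))
    print(f"Saved chunk {chunk_no} to {fname}")
    return chunk_no + 1


def prepare_chunks(repo_id="Aananda-giri/nepberta-sample"):  # hypothetical repo id
    ds = load_dataset(repo_id, split="train", streaming=True)
    buffer, size, chunk_no = [], 0, 1
    for row in ds:
        text = row.get("text", "")
        buffer.append(text)
        size += len(text)
        if size >= CHUNK_CHARS:
            chunk_no = flush(buffer, chunk_no)
            buffer, size = [], 0
    if buffer:
        flush(buffer, chunk_no)


if __name__ == "__main__":
    prepare_chunks()
```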
{
"cell_type": "code",
"source": [
"# source: https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-D/01_main-chapter-code/appendix-D.ipynb\n",
"!python pretraining_bells_n_whistles.py \\\n",
" --data_dir \"nepberta_sample\" \\\n",
" --n_epochs 5 \\\n",
" --batch_size 4 \\\n",
" --output_dir model_checkpoints"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QsqUV8q8t1bA",
"outputId": "bd3279e9-08b1-442b-d623-93cee304bbd0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Total files: 1\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
"1793\n",
"Training ...\n",
"Ep 1 (Iter 000000): Train loss 10.966, Val loss 10.977\n",
"Ep 1 (Iter 000100): Train loss 8.701, Val loss 8.410\n",
"Ep 1 (Iter 000200): Train loss 8.213, Val loss 8.032\n",
"Ep 1 (Iter 000300): Train loss 8.104, Val loss 7.724\n",
"Ep 1 (Iter 000400): Train loss 8.121, Val loss 7.451\n",
"Ep 1 (Iter 000500): Train loss 8.122, Val loss 7.085\n",
"Ep 1 (Iter 000600): Train loss 7.244, Val loss 6.794\n",
"Ep 1 (Iter 000700): Train loss 7.670, Val loss 6.366\n",
"Ep 1 (Iter 000800): Train loss 6.663, Val loss 6.292\n",
"Ep 1 (Iter 000900): Train loss 7.567, Val loss 6.315\n",
"Ep 1 (Iter 001000): Train loss 7.092, Val loss 5.973\n",
"Ep 1 (Iter 001100): Train loss 7.603, Val loss 5.891\n",
"Ep 1 (Iter 001200): Train loss 6.382, Val loss 5.798\n",
"Ep 1 (Iter 001300): Train loss 6.817, Val loss 5.496\n",
"Ep 1 (Iter 001400): Train loss 7.190, Val loss 5.462\n",
"Ep 1 (Iter 001500): Train loss 6.995, Val loss 5.476\n",
"Ep 1 (Iter 001600): Train loss 6.431, Val loss 5.573\n",
"Ep 1 (Iter 001700): Train loss 6.402, Val loss 5.536\n",
"2024-11-07 05:05:39.777346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2024-11-07 05:05:40.045898: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2024-11-07 05:05:40.115645: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-11-07 05:05:40.538212: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2024-11-07 05:05:42.929598: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"रामले भात र त्यसको असर र अन्य कुनै पनि हो । तर पनि हो । तर यो । तर यो । तर पनि हो । तर पनि हो । तर पनि हो । तर पनि हो । तर पनि हो । तर यो । यो । तर पनि हो । तर पनि\n",
"Traceback (most recent call last):\n",
" File \"/content/pretraining_bells_n_whistles.py\", line 260, in <module>\n",
" train_losses, val_losses, tokens_seen, lrs = train_model(\n",
" File \"/content/pretraining_bells_n_whistles.py\", line 106, in train_model\n",
" lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))\n",
"NameError: name 'math' is not defined. Did you mean: 'Path'?\n"
]
}
]
},
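The NameError above is exactly what the second file in this commit fixes by adding `import math` to pretraining_bells_n_whistles.py. Pulled out of the training loop, the failing line implements cosine decay; the sketch below wraps it in a warmup-plus-decay schedule. Only the cosine expression is quoted from the traceback; the warmup branch and the default values are assumptions in the spirit of appendix D of LLMs-from-scratch.

```python
import math

def cosine_schedule_lr(step, total_steps, warmup_steps,
                       peak_lr=5e-4, min_lr=1e-5, initial_lr=3e-5):
    """Linear warmup followed by cosine decay.

    Only the cosine line is taken verbatim from the traceback above; the
    warmup branch and default values are assumptions.
    """
    if step < warmup_steps:
        # Linear warmup from initial_lr up to peak_lr
        return initial_lr + (peak_lr - initial_lr) * step / warmup_steps
    # Fraction of the post-warmup schedule completed, in [0, 1]
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

# Example: LR at a few points of a 6000-step run with 100 warmup steps
for s in (0, 50, 100, 3000, 6000):
    print(s, round(cosine_schedule_lr(s, 6000, 100), 6))
```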
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -350,12 +427,12 @@
"base_uri": "https://localhost:8080/"
},
"id": "rUW9suTsqVyM",
"outputId": "73705398-c54f-45f8-fede-12b2f823b186"
"outputId": "a7b7395c-6df7-42a5-a671-572a01624a64"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"Total files: 1\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
Expand Down Expand Up @@ -411,12 +488,56 @@
"Ep 2 (Step 3200): Train loss 6.069, Val loss 4.773\n",
"Ep 2 (Step 3300): Train loss 5.851, Val loss 4.792\n",
"Ep 2 (Step 3400): Train loss 5.079, Val loss 4.705\n",
"Ep 2 (Step 3500): Train loss 6.145, Val loss 4.788\n"
"Ep 2 (Step 3500): Train loss 6.145, Val loss 4.788\n",
"Saved model_checkpoints/model_pg_3585.pth\n",
"Book processed 0h 46m 28s\n",
"Total time elapsed 1h 33m 5s\n",
"ETA for remaining books: 0h 0m 0s\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
"Training ...\n",
"Ep 3 (Step 3600): Train loss 5.810, Val loss 4.634\n",
"Ep 3 (Step 3700): Train loss 5.523, Val loss 4.722\n",
"Ep 3 (Step 3800): Train loss 5.566, Val loss 4.612\n",
"Ep 3 (Step 3900): Train loss 5.847, Val loss 4.667\n",
"Ep 3 (Step 4000): Train loss 5.112, Val loss 4.575\n",
"रामले भात तरकारी वितरण गरिएको छ । मेलम्ची नगरपालिकामा मेलम्ची नगरपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती\n",
"Ep 3 (Step 4100): Train loss 5.566, Val loss 4.584\n",
"Ep 3 (Step 4200): Train loss 6.099, Val loss 4.513\n",
"Ep 3 (Step 4300): Train loss 5.448, Val loss 4.472\n",
"Ep 3 (Step 4400): Train loss 5.047, Val loss 4.466\n",
"Ep 3 (Step 4500): Train loss 4.812, Val loss 4.504\n",
"Ep 3 (Step 4600): Train loss 5.768, Val loss 4.492\n",
"Ep 3 (Step 4700): Train loss 5.545, Val loss 4.480\n",
"Ep 3 (Step 4800): Train loss 5.514, Val loss 4.520\n",
"Ep 3 (Step 4900): Train loss 5.401, Val loss 4.413\n",
"Ep 3 (Step 5000): Train loss 4.242, Val loss 4.406\n",
"रामले भात खान लाउन थाले । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर\n",
"Ep 3 (Step 5100): Train loss 4.827, Val loss 4.352\n",
"Ep 3 (Step 5200): Train loss 5.045, Val loss 4.355\n",
"Ep 3 (Step 5300): Train loss 5.586, Val loss 4.271\n",
"Saved model_checkpoints/model_pg_5378.pth\n",
"Book processed 0h 46m 37s\n",
"Total time elapsed 2h 19m 43s\n",
"ETA for remaining books: 0h 0m 0s\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
"Training ...\n",
"Ep 4 (Step 5400): Train loss 5.380, Val loss 4.290\n",
"Ep 4 (Step 5500): Train loss 5.378, Val loss 4.356\n",
"Ep 4 (Step 5600): Train loss 5.512, Val loss 4.388\n",
"Ep 4 (Step 5700): Train loss 4.381, Val loss 4.288\n",
"Ep 4 (Step 5800): Train loss 5.105, Val loss 4.272\n",
"Ep 4 (Step 5900): Train loss 5.250, Val loss 4.274\n",
"Ep 4 (Step 6000): Train loss 5.974, Val loss 4.215\n",
"रामले भात खाए । उनले भने \" अब त । तर पनि त । तर पनि त । तर पनि त कहिले पनि त । तर पनि त । तर पनि त । तर पनि त कहिले पनि त । तर पनि त । तर पनि त कहिले पनि त । तर\n",
"Ep 4 (Step 6100): Train loss 4.755, Val loss 4.224\n",
"Ep 4 (Step 6200): Train loss 4.875, Val loss 4.207\n",
"Ep 4 (Step 6300): Train loss 5.519, Val loss 4.173\n",
"Ep 4 (Step 6400): Train loss 4.497, Val loss 4.137\n"
]
}
],
"source": [
"# to run the code:\n",
"# Alternatively: to run the simpler version of code (source: https://github.com/rasbt/LLMs-from-scratch/tree/main/ch05/03_bonus_pretraining_on_gutenberg)\n",
"!python pretraining_simple.py \\\n",
" --data_dir \"nepberta_sample\" \\\n",
" --n_epochs 5 \\\n",
Expand All @@ -426,30 +547,30 @@
},
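pretraining_simple.py comes from rasbt/LLMs-from-scratch and is not reproduced in this commit; an argparse stub consistent with the flags passed above might look like the following (help strings and defaults are assumptions).

```python
# Illustrative CLI stub matching the invocation above; pretraining_simple.py
# is from rasbt/LLMs-from-scratch, so treat names and defaults as assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Pretrain GPT-2 on chunked text files")
    parser.add_argument("--data_dir", type=str, default="nepberta_sample",
                        help="directory with combined_*.txt training chunks")
    parser.add_argument("--n_epochs", type=int, default=5,
                        help="number of passes over each chunk")
    parser.add_argument("--batch_size", type=int, default=4,
                        help="training batch size")
    parser.add_argument("--output_dir", type=str, default="model_checkpoints",
                        help="where model_pg_<step>.pth checkpoints are saved")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    print(args)
```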
{
"cell_type": "code",
"execution_count": 22,
"source": [
"# !rm -rf model_checkpoints"
],
"metadata": {
"id": "bMVaOb7O90G8"
},
"outputs": [],
"source": [
"!rm -rf model_checkpoints"
]
"execution_count": null,
"outputs": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
4 changes: 3 additions & 1 deletion 3. GPT-2/sebastian_gutenberg/pretraining_bells_n_whistles.py
@@ -1,10 +1,12 @@
# code from https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-D/01_main-chapter-code/appendix-D.ipynb

import argparse
import math
import os
from pathlib import Path
import time


# modified: tokenizer import
# import tiktoken
from transformers import PreTrainedTokenizerFast
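With tiktoken swapped for a Hugging Face fast tokenizer, loading and using it might look like the sketch below; the tokenizer file name is hypothetical, since the diff shows only the import change.

```python
# Hypothetical usage of the swapped-in tokenizer; the tokenizer_file path is
# an assumption, as this diff only shows the import change.
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="nepali_tokenizer.json")

ids = tokenizer.encode("रामले भात")  # text -> token ids (tiktoken encode() equivalent)
print(tokenizer.decode(ids))         # token ids -> text
```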
@@ -141,7 +143,7 @@ def train_model(model, train_loader, val_loader, optimizer, device,
f"Val loss {val_loss:.3f}"
)

# Generate and print a sample from the model to monitor progress (at the end of each epoch)
generate_and_print_sample(
model, tokenizer, device, start_context
)
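generate_and_print_sample is a chapter 5 utility from the book; a rough sketch of its behavior is below, with greedy decoding and an assumed 50-token sample length and 1024-token context window. Greedy decoding would also explain the repetitive loops visible in the printed samples above.

```python
# Hedged sketch of generate_and_print_sample; sample length, context size, and
# greedy decoding are assumptions about the book's helper, not this repo's code.
import torch

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    ids = torch.tensor([tokenizer.encode(start_context)], device=device)
    with torch.no_grad():
        for _ in range(50):                      # assumed sample length
            logits = model(ids[:, -1024:])       # assumed context_length of 1024
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy pick
            ids = torch.cat([ids, next_id], dim=1)
    print(tokenizer.decode(ids.squeeze(0).tolist()).replace("\n", " "))
    model.train()
```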