diff --git a/3. GPT-2/sebastian_gutenberg/modifications.ipynb b/3. GPT-2/sebastian_gutenberg/modifications.ipynb
index 5457780..0b9a554 100644
--- a/3. GPT-2/sebastian_gutenberg/modifications.ipynb
+++ b/3. GPT-2/sebastian_gutenberg/modifications.ipynb
@@ -32,6 +32,17 @@
     "!pip install datasets --quiet"
    ]
   },
+  {
+   "cell_type": "code",
+   "source": [
+    "print(\"hi\")"
+   ],
+   "metadata": {
+    "id": "2WdVM-BTn6Y6"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -253,17 +264,7 @@
     "GPT_CONFIG_124M = {\n",
     " \"vocab_size\": 50000\n",
     " ...\n",
-    "```\n",
-    "\n",
-    "**3. limit text size to 45Million:**\n",
-    "# otherwise it is giving cuda out of memory error.\n",
-    "text_data = text_data[:45000000]\n",
-    "\n",
-    "**3. Modify start context:**\n",
-    "start_context = \"रामले भात\", # \n",
-    "\n",
-    "# instead of\n",
-    "start_context = \"Every effort moves you\", # "
+    "```"
    ]
   },
   {
@@ -277,32 +278,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "source": [
+    "!pip install datasets --quiet"
+   ],
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "mOKoDTpAw4T5",
-    "outputId": "f11092a8-fbcf-472c-9c0d-67d85407c3e1"
+    "outputId": "5587b84b-b2b6-46ad-92b1-3f4e0e972dd6"
    },
+   "execution_count": null,
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m471.0/480.6 kB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m471.0/480.6 kB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/116.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/179.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/134.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/194.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0m"
     ]
    }
-   ],
-   "source": [
-    "!pip install datasets --quiet"
-   ]
+   ]
  },
  {
@@ -317,8 +318,8 @@
    },
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
       "[Errno 2] No such file or directory: '/content/drive/MyDrive/Research/llm.np/sebastian_gutenberg/'\n",
       "/content\n"
@@ -334,14 +335,90 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "id": "eVrF8P9stXG1"
+    "id": "eVrF8P9stXG1",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "18058b3d-c56b-43a7-a901-51b3ab6f83b7"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\rREADME.md: 0% 0.00/2.39k [00:00
+      " train_losses, val_losses, tokens_seen, lrs = train_model(\n",
+      "  File \"/content/pretraining_bells_n_whistles.py\", line 106, in train_model\n",
+      "    lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))\n",
+      "NameError: name 'math' is not defined. Did you mean: 'Path'?\n"
+     ]
+    }
+   ]
  },
  {
@@ -350,12 +427,12 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "rUW9suTsqVyM",
-    "outputId": "73705398-c54f-45f8-fede-12b2f823b186"
+    "outputId": "a7b7395c-6df7-42a5-a671-572a01624a64"
    },
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
       "Total files: 1\n",
       "Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
@@ -411,12 +488,56 @@
       "Ep 2 (Step 3200): Train loss 6.069, Val loss 4.773\n",
       "Ep 2 (Step 3300): Train loss 5.851, Val loss 4.792\n",
       "Ep 2 (Step 3400): Train loss 5.079, Val loss 4.705\n",
-      "Ep 2 (Step 3500): Train loss 6.145, Val loss 4.788\n"
+      "Ep 2 (Step 3500): Train loss 6.145, Val loss 4.788\n",
+      "Saved model_checkpoints/model_pg_3585.pth\n",
+      "Book processed 0h 46m 28s\n",
+      "Total time elapsed 1h 33m 5s\n",
+      "ETA for remaining books: 0h 0m 0s\n",
+      "Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
+      "Training ...\n",
+      "Ep 3 (Step 3600): Train loss 5.810, Val loss 4.634\n",
+      "Ep 3 (Step 3700): Train loss 5.523, Val loss 4.722\n",
+      "Ep 3 (Step 3800): Train loss 5.566, Val loss 4.612\n",
+      "Ep 3 (Step 3900): Train loss 5.847, Val loss 4.667\n",
+      "Ep 3 (Step 4000): Train loss 5.112, Val loss 4.575\n",
+      "रामले भात तरकारी वितरण गरिएको छ । मेलम्ची नगरपालिकामा मेलम्ची नगरपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती\n",
+      "Ep 3 (Step 4100): Train loss 5.566, Val loss 4.584\n",
+      "Ep 3 (Step 4200): Train loss 6.099, Val loss 4.513\n",
+      "Ep 3 (Step 4300): Train loss 5.448, Val loss 4.472\n",
+      "Ep 3 (Step 4400): Train loss 5.047, Val loss 4.466\n",
+      "Ep 3 (Step 4500): Train loss 4.812, Val loss 4.504\n",
+      "Ep 3 (Step 4600): Train loss 5.768, Val loss 4.492\n",
+      "Ep 3 (Step 4700): Train loss 5.545, Val loss 4.480\n",
+      "Ep 3 (Step 4800): Train loss 5.514, Val loss 4.520\n",
+      "Ep 3 (Step 4900): Train loss 5.401, Val loss 4.413\n",
+      "Ep 3 (Step 5000): Train loss 4.242, Val loss 4.406\n",
+      "रामले भात खान लाउन थाले । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर\n",
+      "Ep 3 (Step 5100): Train loss 4.827, Val loss 4.352\n",
+      "Ep 3 (Step 5200): Train loss 5.045, Val loss 4.355\n",
+      "Ep 3 (Step 5300): Train loss 5.586, Val loss 4.271\n",
+      "Saved model_checkpoints/model_pg_5378.pth\n",
+      "Book processed 0h 46m 37s\n",
+      "Total time elapsed 2h 19m 43s\n",
+      "ETA for remaining books: 0h 0m 0s\n",
+      "Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
+      "Training ...\n",
+      "Ep 4 (Step 5400): Train loss 5.380, Val loss 4.290\n",
+      "Ep 4 (Step 5500): Train loss 5.378, Val loss 4.356\n",
+      "Ep 4 (Step 5600): Train loss 5.512, Val loss 4.388\n",
+      "Ep 4 (Step 5700): Train loss 4.381, Val loss 4.288\n",
+      "Ep 4 (Step 5800): Train loss 5.105, Val loss 4.272\n",
+      "Ep 4 (Step 5900): Train loss 5.250, Val loss 4.274\n",
+      "Ep 4 (Step 6000): Train loss 5.974, Val loss 4.215\n",
+      "रामले भात खाए । उनले भने \" अब त । तर पनि त । तर पनि त । तर पनि त कहिले पनि त । तर पनि त । तर पनि त । तर पनि त कहिले पनि त । तर पनि त । तर पनि त कहिले पनि त । तर\n",
+      "Ep 4 (Step 6100): Train loss 4.755, Val loss 4.224\n",
+      "Ep 4 (Step 6200): Train loss 4.875, Val loss 4.207\n",
+      "Ep 4 (Step 6300): Train loss 5.519, Val loss 4.173\n",
+      "Ep 4 (Step 6400): Train loss 4.497, Val loss 4.137\n"
      ]
     }
    ],
    "source": [
-    "# to run the code:\n",
+    "# Alternatively: to run the simpler version of code (source: https://github.com/rasbt/LLMs-from-scratch/tree/main/ch05/03_bonus_pretraining_on_gutenberg)\n",
     "!python pretraining_simple.py \\\n",
     " --data_dir \"nepberta_sample\" \\\n",
     " --n_epochs 5 \\\n",
@@ -426,21 +547,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "source": [
+    "# !rm -rf model_checkpoints"
+   ],
    "metadata": {
     "id": "bMVaOb7O90G8"
    },
-   "outputs": [],
-   "source": [
-    "!rm -rf model_checkpoints"
-   ]
+   "execution_count": null,
+   "outputs": []
   }
  ],
  "metadata": {
-  "accelerator": "GPU",
   "colab": {
-   "gpuType": "T4",
-   "provenance": []
+   "provenance": [],
+   "gpuType": "T4"
   },
   "kernelspec": {
    "display_name": "Python 3",
@@ -448,8 +568,9 @@
   },
   "language_info": {
    "name": "python"
-  }
+  },
+  "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/3. GPT-2/sebastian_gutenberg/pretraining_bells_n_whistles.py b/3. GPT-2/sebastian_gutenberg/pretraining_bells_n_whistles.py
index b31b50e..d9d172f 100644
--- a/3. GPT-2/sebastian_gutenberg/pretraining_bells_n_whistles.py
+++ b/3. GPT-2/sebastian_gutenberg/pretraining_bells_n_whistles.py
@@ -1,10 +1,12 @@
 # code from https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-D/01_main-chapter-code/appendix-D.ipynb
 
 import argparse
+import math
 import os
 from pathlib import Path
 import time
 
+
 # modified. tokenizer import
 # import tiktoken
 from transformers import PreTrainedTokenizerFast
@@ -141,7 +143,7 @@ def train_model(model, train_loader, val_loader, optimizer, device,
                 f"Val loss {val_loss:.3f}"
             )
 
-        # Generate and print a sample from the model to monitor progress
+        # Generate and print a sample from the model to monitor progress (at the end of each epoch)
         generate_and_print_sample(
             model, tokenizer, device, start_context
         )
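Aside on the `import math` fix in `pretraining_bells_n_whistles.py`: the `NameError` captured in the notebook output above comes from line 106, where the learning rate is computed as `min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))` before `math` was imported. Below is a minimal, self-contained sketch of a warmup-plus-cosine-decay schedule built around that same expression; the `peak_lr`, `min_lr`, `warmup_steps`, and `total_steps` values are hypothetical examples, not the settings from this training run.

```python
import math

# Hypothetical example values -- not the hyperparameters used in the run above.
peak_lr = 5e-4
min_lr = 1e-5
warmup_steps = 100
total_steps = 1000

def cosine_lr(step: int) -> float:
    """Linear warmup to peak_lr, then cosine decay from peak_lr toward min_lr."""
    if step < warmup_steps:
        # Warmup phase: ramp linearly from ~0 up to peak_lr.
        return peak_lr * (step + 1) / warmup_steps
    # Decay phase: progress runs from 0.0 to 1.0 over the remaining steps.
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    # The expression from the traceback; without `import math` this line
    # raises: NameError: name 'math' is not defined.
    return min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

for step in (0, 50, 100, 500, 999):
    print(f"step {step:4d}: lr = {cosine_lr(step):.6f}")
```

The cosine term falls from `cos(0) = 1` at the start of the decay to `cos(pi) = -1` at the end, so the bracketed factor shrinks smoothly from 1 to 0 and the learning rate from `peak_lr` to `min_lr`; the schedule therefore needs `math` in scope on every step after warmup, which is exactly what the added import provides.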