Commit

code for bells and whistles on nepali dataset (in project gutenberg code format)
Aananda-giri committed Nov 7, 2024
1 parent 7db1d1b commit 5c349cd
Showing 2 changed files with 163 additions and 40 deletions.
199 changes: 160 additions & 39 deletions 3. GPT-2/sebastian_gutenberg/modifications.ipynb
@@ -32,6 +32,17 @@
"!pip install datasets --quiet"
]
},
{
"cell_type": "code",
"source": [
"print(\"hi\")"
],
"metadata": {
"id": "2WdVM-BTn6Y6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
@@ -253,17 +264,7 @@
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50000\n",
" ...\n",
"```\n",
"\n",
"**3. limit text size to 45Million:**\n",
"# otherwise it is giving cuda out of memory error.\n",
"text_data = text_data[:45000000]\n",
"\n",
"**3. Modify start context:**\n",
"start_context = \"रामले भात\", # <modified>\n",
"\n",
"# instead of\n",
"start_context = \"Every effort moves you\", # <original>"
"```"
]
},
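For reference, the modifications listed in the markdown cell above can be collected into one runnable sketch. Only `vocab_size`, the 45-million-character cap, and the Nepali start context come from this notebook; the remaining config fields and the input file path are assumed defaults from the book's chapter 5 code, not part of this commit.

```python
# Sketch of the notebook's modifications; fields other than vocab_size and the
# input path are assumptions (book defaults), not part of this commit.
from pathlib import Path

GPT_CONFIG_124M = {
    "vocab_size": 50000,      # modified for the Nepali tokenizer (GPT-2 default: 50257)
    "context_length": 1024,   # assumed default
    "emb_dim": 768,           # assumed default
    "n_heads": 12,            # assumed default
    "n_layers": 12,           # assumed default
    "drop_rate": 0.1,         # assumed default
    "qkv_bias": False,        # assumed default
}

# Limit the corpus to 45 million characters; otherwise training hits a
# CUDA out-of-memory error. The file path is hypothetical, for illustration.
text_data = Path("nepberta_sample/combined_1.txt").read_text(encoding="utf-8")
text_data = text_data[:45_000_000]

# Nepali start context (roughly "Ram ... rice") for generation samples,
# instead of the book's "Every effort moves you".
start_context = "रामले भात"
```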
{
Expand All @@ -277,32 +278,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!pip install datasets --quiet"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mOKoDTpAw4T5",
"outputId": "f11092a8-fbcf-472c-9c0d-67d85407c3e1"
"outputId": "5587b84b-b2b6-46ad-92b1-3f4e0e972dd6"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m471.0/480.6 kB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m471.0/480.6 kB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/116.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/179.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/134.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/194.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
],
"source": [
"!pip install datasets --quiet"
]
},
{
Expand All @@ -317,8 +318,8 @@
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"[Errno 2] No such file or directory: '/content/drive/MyDrive/Research/llm.np/sebastian_gutenberg/'\n",
"/content\n"
Expand All @@ -334,14 +335,90 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eVrF8P9stXG1"
"id": "eVrF8P9stXG1",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "18058b3d-c56b-43a7-a901-51b3ab6f83b7"
},
"outputs": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\rREADME.md: 0% 0.00/2.39k [00:00<?, ?B/s]\rREADME.md: 100% 2.39k/2.39k [00:00<00:00, 13.5MB/s]\n",
"Repo card metadata block was not found. Setting CardData to empty.\n",
"Saved chunk 1 to chunk_1.txt\n"
]
}
],
"source": [
"# Download dataset\n",
"!python3 prepare_dataset.py"
]
},
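prepare_dataset.py itself is not part of this diff; judging only from the logged output above (a Hub download followed by "Saved chunk 1 to chunk_1.txt"), a minimal sketch of what it might do is below. The dataset repo id and the "text" column are assumptions.

```python
# Hypothetical sketch of prepare_dataset.py, reconstructed from the log above:
# stream a Nepali corpus from the Hugging Face Hub and save it as text chunks.
# The repo id and the "text" column name are assumptions.
from datasets import load_dataset

CHUNK_CHARS = 45_000_000  # aligns with the 45M-character cap used in training


def flush(buffer, chunk_no):
    # Write the accumulated texts to chunk_<n>.txt and report it.
    fname = f"chunk_{chunk_no}.txt"
    with open(fname, "w", encoding="utf-8") as f:
        f.write(" ".join(buffer))
    print(f"Saved chunk {chunk_no} to {fname}")
    return chunk_no + 1


def prepare_chunks(repo_id="Aananda-giri/nepberta-sample"):  # hypothetical repo id
    ds = load_dataset(repo_id, split="train", streaming=True)
    buffer, size, chunk_no = [], 0, 1
    for row in ds:
        text = row.get("text", "")
        buffer.append(text)
        size += len(text)
        if size >= CHUNK_CHARS:
            chunk_no = flush(buffer, chunk_no)
            buffer, size = [], 0
    if buffer:
        flush(buffer, chunk_no)


if __name__ == "__main__":
    prepare_chunks()
```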
{
"cell_type": "code",
"source": [
"# source: https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-D/01_main-chapter-code/appendix-D.ipynb\n",
"!python pretraining_bells_n_whistles.py \\\n",
" --data_dir \"nepberta_sample\" \\\n",
" --n_epochs 5 \\\n",
" --batch_size 4 \\\n",
" --output_dir model_checkpoints"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QsqUV8q8t1bA",
"outputId": "bd3279e9-08b1-442b-d623-93cee304bbd0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Total files: 1\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
"1793\n",
"Training ...\n",
"Ep 1 (Iter 000000): Train loss 10.966, Val loss 10.977\n",
"Ep 1 (Iter 000100): Train loss 8.701, Val loss 8.410\n",
"Ep 1 (Iter 000200): Train loss 8.213, Val loss 8.032\n",
"Ep 1 (Iter 000300): Train loss 8.104, Val loss 7.724\n",
"Ep 1 (Iter 000400): Train loss 8.121, Val loss 7.451\n",
"Ep 1 (Iter 000500): Train loss 8.122, Val loss 7.085\n",
"Ep 1 (Iter 000600): Train loss 7.244, Val loss 6.794\n",
"Ep 1 (Iter 000700): Train loss 7.670, Val loss 6.366\n",
"Ep 1 (Iter 000800): Train loss 6.663, Val loss 6.292\n",
"Ep 1 (Iter 000900): Train loss 7.567, Val loss 6.315\n",
"Ep 1 (Iter 001000): Train loss 7.092, Val loss 5.973\n",
"Ep 1 (Iter 001100): Train loss 7.603, Val loss 5.891\n",
"Ep 1 (Iter 001200): Train loss 6.382, Val loss 5.798\n",
"Ep 1 (Iter 001300): Train loss 6.817, Val loss 5.496\n",
"Ep 1 (Iter 001400): Train loss 7.190, Val loss 5.462\n",
"Ep 1 (Iter 001500): Train loss 6.995, Val loss 5.476\n",
"Ep 1 (Iter 001600): Train loss 6.431, Val loss 5.573\n",
"Ep 1 (Iter 001700): Train loss 6.402, Val loss 5.536\n",
"2024-11-07 05:05:39.777346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2024-11-07 05:05:40.045898: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2024-11-07 05:05:40.115645: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-11-07 05:05:40.538212: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2024-11-07 05:05:42.929598: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"रामले भात र त्यसको असर र अन्य कुनै पनि हो । तर पनि हो । तर यो । तर यो । तर पनि हो । तर पनि हो । तर पनि हो । तर पनि हो । तर पनि हो । तर यो । यो । तर पनि हो । तर पनि\n",
"Traceback (most recent call last):\n",
" File \"/content/pretraining_bells_n_whistles.py\", line 260, in <module>\n",
" train_losses, val_losses, tokens_seen, lrs = train_model(\n",
" File \"/content/pretraining_bells_n_whistles.py\", line 106, in train_model\n",
" lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))\n",
"NameError: name 'math' is not defined. Did you mean: 'Path'?\n"
]
}
]
},
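The NameError above is exactly what the second file in this commit fixes by adding `import math` to pretraining_bells_n_whistles.py. Pulled out of the training loop, the failing line implements cosine decay; the sketch below wraps it in a warmup-plus-decay schedule. Only the cosine expression is quoted from the traceback; the warmup branch and the default values are assumptions in the spirit of appendix D of LLMs-from-scratch.

```python
import math

def cosine_schedule_lr(step, total_steps, warmup_steps,
                       peak_lr=5e-4, min_lr=1e-5, initial_lr=3e-5):
    """Linear warmup followed by cosine decay.

    Only the cosine line is taken verbatim from the traceback above; the
    warmup branch and default values are assumptions.
    """
    if step < warmup_steps:
        # Linear warmup from initial_lr up to peak_lr
        return initial_lr + (peak_lr - initial_lr) * step / warmup_steps
    # Fraction of the post-warmup schedule completed, in [0, 1]
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

# Example: LR at a few points of a 6000-step run with 100 warmup steps
for s in (0, 50, 100, 3000, 6000):
    print(s, round(cosine_schedule_lr(s, 6000, 100), 6))
```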
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -350,12 +427,12 @@
"base_uri": "https://localhost:8080/"
},
"id": "rUW9suTsqVyM",
"outputId": "73705398-c54f-45f8-fede-12b2f823b186"
"outputId": "a7b7395c-6df7-42a5-a671-572a01624a64"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"Total files: 1\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
Expand Down Expand Up @@ -411,12 +488,56 @@
"Ep 2 (Step 3200): Train loss 6.069, Val loss 4.773\n",
"Ep 2 (Step 3300): Train loss 5.851, Val loss 4.792\n",
"Ep 2 (Step 3400): Train loss 5.079, Val loss 4.705\n",
"Ep 2 (Step 3500): Train loss 6.145, Val loss 4.788\n"
"Ep 2 (Step 3500): Train loss 6.145, Val loss 4.788\n",
"Saved model_checkpoints/model_pg_3585.pth\n",
"Book processed 0h 46m 28s\n",
"Total time elapsed 1h 33m 5s\n",
"ETA for remaining books: 0h 0m 0s\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
"Training ...\n",
"Ep 3 (Step 3600): Train loss 5.810, Val loss 4.634\n",
"Ep 3 (Step 3700): Train loss 5.523, Val loss 4.722\n",
"Ep 3 (Step 3800): Train loss 5.566, Val loss 4.612\n",
"Ep 3 (Step 3900): Train loss 5.847, Val loss 4.667\n",
"Ep 3 (Step 4000): Train loss 5.112, Val loss 4.575\n",
"रामले भात तरकारी वितरण गरिएको छ । मेलम्ची नगरपालिकामा मेलम्ची नगरपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती गाउँपालिका र इन्द्रावती\n",
"Ep 3 (Step 4100): Train loss 5.566, Val loss 4.584\n",
"Ep 3 (Step 4200): Train loss 6.099, Val loss 4.513\n",
"Ep 3 (Step 4300): Train loss 5.448, Val loss 4.472\n",
"Ep 3 (Step 4400): Train loss 5.047, Val loss 4.466\n",
"Ep 3 (Step 4500): Train loss 4.812, Val loss 4.504\n",
"Ep 3 (Step 4600): Train loss 5.768, Val loss 4.492\n",
"Ep 3 (Step 4700): Train loss 5.545, Val loss 4.480\n",
"Ep 3 (Step 4800): Train loss 5.514, Val loss 4.520\n",
"Ep 3 (Step 4900): Train loss 5.401, Val loss 4.413\n",
"Ep 3 (Step 5000): Train loss 4.242, Val loss 4.406\n",
"रामले भात खान लाउन थाले । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि । तर पनि थिएन । तर पनि । तर\n",
"Ep 3 (Step 5100): Train loss 4.827, Val loss 4.352\n",
"Ep 3 (Step 5200): Train loss 5.045, Val loss 4.355\n",
"Ep 3 (Step 5300): Train loss 5.586, Val loss 4.271\n",
"Saved model_checkpoints/model_pg_5378.pth\n",
"Book processed 0h 46m 37s\n",
"Total time elapsed 2h 19m 43s\n",
"ETA for remaining books: 0h 0m 0s\n",
"Tokenizing file 1 of 1: nepberta_sample/combined_1.txt\n",
"Training ...\n",
"Ep 4 (Step 5400): Train loss 5.380, Val loss 4.290\n",
"Ep 4 (Step 5500): Train loss 5.378, Val loss 4.356\n",
"Ep 4 (Step 5600): Train loss 5.512, Val loss 4.388\n",
"Ep 4 (Step 5700): Train loss 4.381, Val loss 4.288\n",
"Ep 4 (Step 5800): Train loss 5.105, Val loss 4.272\n",
"Ep 4 (Step 5900): Train loss 5.250, Val loss 4.274\n",
"Ep 4 (Step 6000): Train loss 5.974, Val loss 4.215\n",
"रामले भात खाए । उनले भने \" अब त । तर पनि त । तर पनि त । तर पनि त कहिले पनि त । तर पनि त । तर पनि त । तर पनि त कहिले पनि त । तर पनि त । तर पनि त कहिले पनि त । तर\n",
"Ep 4 (Step 6100): Train loss 4.755, Val loss 4.224\n",
"Ep 4 (Step 6200): Train loss 4.875, Val loss 4.207\n",
"Ep 4 (Step 6300): Train loss 5.519, Val loss 4.173\n",
"Ep 4 (Step 6400): Train loss 4.497, Val loss 4.137\n"
]
}
],
"source": [
"# to run the code:\n",
"# Alternatively: to run the simpler version of code (source: https://github.com/rasbt/LLMs-from-scratch/tree/main/ch05/03_bonus_pretraining_on_gutenberg)\n",
"!python pretraining_simple.py \\\n",
" --data_dir \"nepberta_sample\" \\\n",
" --n_epochs 5 \\\n",
Expand All @@ -426,30 +547,30 @@
},
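pretraining_simple.py comes from rasbt/LLMs-from-scratch and is not reproduced in this commit; an argparse stub consistent with the flags passed above might look like the following (help strings and defaults are assumptions).

```python
# Illustrative CLI stub matching the invocation above; pretraining_simple.py
# is from rasbt/LLMs-from-scratch, so treat names and defaults as assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Pretrain GPT-2 on chunked text files")
    parser.add_argument("--data_dir", type=str, default="nepberta_sample",
                        help="directory with combined_*.txt training chunks")
    parser.add_argument("--n_epochs", type=int, default=5,
                        help="number of passes over each chunk")
    parser.add_argument("--batch_size", type=int, default=4,
                        help="training batch size")
    parser.add_argument("--output_dir", type=str, default="model_checkpoints",
                        help="where model_pg_<step>.pth checkpoints are saved")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    print(args)
```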
{
"cell_type": "code",
"execution_count": 22,
"source": [
"# !rm -rf model_checkpoints"
],
"metadata": {
"id": "bMVaOb7O90G8"
},
"outputs": [],
"source": [
"!rm -rf model_checkpoints"
]
"execution_count": null,
"outputs": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
4 changes: 3 additions & 1 deletion 3. GPT-2/sebastian_gutenberg/pretraining_bells_n_whistles.py
@@ -1,10 +1,12 @@
# code from https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-D/01_main-chapter-code/appendix-D.ipynb

import argparse
import math
import os
from pathlib import Path
import time


# modified: tokenizer import
# import tiktoken
from transformers import PreTrainedTokenizerFast
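With tiktoken swapped for a Hugging Face fast tokenizer, loading and using it might look like the sketch below; the tokenizer file name is hypothetical, since the diff shows only the import change.

```python
# Hypothetical usage of the swapped-in tokenizer; the tokenizer_file path is
# an assumption, as this diff only shows the import change.
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="nepali_tokenizer.json")

ids = tokenizer.encode("रामले भात")  # text -> token ids (tiktoken encode() equivalent)
print(tokenizer.decode(ids))         # token ids -> text
```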
@@ -141,7 +143,7 @@ def train_model(model, train_loader, val_loader, optimizer, device,
f"Val loss {val_loss:.3f}"
)

# Generate and print a sample from the model to monitor progress (at the end of each epoch)
generate_and_print_sample(
model, tokenizer, device, start_context
)
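generate_and_print_sample is a chapter 5 utility from the book; a rough sketch of its behavior is below, with greedy decoding and an assumed 50-token sample length and 1024-token context window. Greedy decoding would also explain the repetitive loops visible in the printed samples above.

```python
# Hedged sketch of generate_and_print_sample; sample length, context size, and
# greedy decoding are assumptions about the book's helper, not this repo's code.
import torch

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    ids = torch.tensor([tokenizer.encode(start_context)], device=device)
    with torch.no_grad():
        for _ in range(50):                      # assumed sample length
            logits = model(ids[:, -1024:])       # assumed context_length of 1024
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy pick
            ids = torch.cat([ids, next_id], dim=1)
    print(tokenizer.decode(ids.squeeze(0).tolist()).replace("\n", " "))
    model.train()
```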