From a45987a45a0180b68a65a3a25606df745bc0c157 Mon Sep 17 00:00:00 2001 From: haohanchen-yagao Date: Thu, 26 May 2022 13:17:02 -0700 Subject: [PATCH] Update SMMP notebooks --- .../gpt-j/01_train_gptj_smp_notebook.ipynb | 2 +- .../11_train_gptj_smp_tensor_parallel_notebook.ipynb | 7 +++---- .../model_parallel/gpt2/smp-train-gpt-simple.ipynb | 11 +++++------ .../pytorch/model_parallel/gpt2/train_gpt_simple.py | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb b/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb index c8ca985ce6..4b4a0a61bc 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb +++ b/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb @@ -21,7 +21,7 @@ "This notebook depends on the following files and folders:\n", "\n", "1. `train_gptj_smp_script.py`: This is an entrypoint script that is passed to the PyTorch estimator in the notebook instructions. This script is responsible for end to end training of the GPT-J model with SMP. The script has additional comments at places where the SMP API is used.\n", - "2. `fp16`: This folder is used for 16-bit float training, which contains a fp16 optimizer and various fp16 utilities.\n", + "2. `memory_tracker.py`: This contains the functions to track memory usage.\n", "3. `learning_rates.py`: This contains the functions for learning rate schedule.\n", "4. `requirements.txt`: This will install the dependencies, like the right version of huggingface transformers.\n", "5. `preprocess.py`: This will download and preprocess the sst2/glue dataset.\n", diff --git a/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb b/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb index 9f4d8ddb7c..5f74112189 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb +++ b/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb @@ -39,7 +39,7 @@ "! pip install -qU \"sagemaker>=2,<3\"\n", "! 
pip install -qU sagemaker-experiments\n",
"\n",
- "# import IPython\n",
+ "# import IPython\n",
"# IPython.Application.instance().kernel.do_shutdown(True)"
]
},
@@ -851,17 +851,16 @@
" \"partitions\": hyperparameters[\"pipeline_parallel_degree\"],\n",
" \"shard_optimizer_state\": hyperparameters[\"shard_optimizer_state\"] > 0,\n",
" \"prescaled_batch\": hyperparameters[\"prescaled_batch\"] > 0,\n",
- " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n",
+ " \"fp16\": hyperparameters[\"fp16\"] > 0,\n",
" \"optimize\": hyperparameters[\"optimize\"],\n",
" \"auto_partition\": False if hyperparameters[\"manual_partition\"] else True,\n",
" \"default_partition\": 0,\n",
- " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n",
" \"optimize\": hyperparameters[\"optimize\"],\n",
" },\n",
" }\n",
" },\n",
" },\n",
- " pytorch_version=\"1.10\",\n",
+ " pytorch_version=\"1.11\",\n",
" transformers_version=\"4.17\",\n",
" py_version=\"py38\",\n",
" output_path=s3_output_bucket,\n",
diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb
index 0668dda4dd..8ad4c78618 100644
--- a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb
+++ b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Train GPT-2 with PyTorch 1.8.1 and Tensor Parallelism Using the SageMaker Model Parallelism Library"
+ "# Train GPT-2 with PyTorch 1.11 and Tensor Parallelism Using the SageMaker Model Parallelism Library"
]
},
{
@@ -13,16 +13,16 @@
"source": [
"This notebook walks you through how to use the SageMaker model parallelism (SMP) library. You'll learn how to train the GPT-2 model with SageMaker's model parallelism.\n",
"\n",
- "The GPT-2 model was proposed by OpenAI in paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration. Currently, SMP only supports Hugging Face Transformers version 4.4.2.\n",
+ "The GPT-2 model was proposed by OpenAI in the paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration. \n",
"\n",
"This notebook depends on the following files and folders:\n",
"\n",
"- `train_gpt_simple.py`: This is an entrypoint script that is passed to the Hugging Face estimator in the notebook instructions. This script is responsible for end to end training of the GPT-2 model with SMP. 
The script has additional comments at places where the SMP API is used.\n", - "- `fp16`: This folder is used for 16-bit float training, which contains a fp16 optimizer and various fp16 utilities.\n", "- `data_pipeline.py`: This contains the datapipeline function to prepare the training data.\n", "- `learining_rate.py`: This contains the functions for learning rate schedule.\n", "- `requirements.txt`: This will install the dependencies, like the right version of huggingface transformers.\n", "- `data_prep_512.py`: This will download and preprocess the openwebtext dataset.\n", + "- `memory_tracker.py`: This contains the functions to track memory usage.\n", "\n", "### Additional Resources\n", "If you are a new user of Amazon SageMaker, you may find the following helpful to learn more about SMP and using SageMaker with PyTorch.\n", @@ -569,17 +569,16 @@ " \"partitions\": hyperparameters[\"pipeline_parallel_degree\"],\n", " \"shard_optimizer_state\": hyperparameters[\"shard_optimizer_state\"] > 0,\n", " \"prescaled_batch\": hyperparameters[\"prescaled_batch\"] > 0,\n", - " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n", + " \"fp16\": hyperparameters[\"fp16\"] > 0,\n", " \"optimize\": hyperparameters[\"optimize\"],\n", " \"auto_partition\": False if hyperparameters[\"manual_partition\"] else True,\n", " \"default_partition\": 0,\n", - " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n", " \"optimize\": hyperparameters[\"optimize\"],\n", " },\n", " }\n", " },\n", " },\n", - " pytorch_version=\"1.10\",\n", + " pytorch_version=\"1.11\",\n", " transformers_version=\"4.17\",\n", " py_version=\"py38\",\n", " output_path=s3_output_location,\n", diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py b/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py index f18ca789cb..2b676db9fb 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py +++ b/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py @@ -651,7 +651,7 @@ def parse_args(): # I/O io_grp = parser.add_argument_group(title="io", description="location for input and output") - io_grp.add_argument("--use_wiki_data", type=int, default=0, help="use bert data for training") + io_grp.add_argument("--use_wiki_data", type=int, default=0, help="use wiki corpus data for training") io_grp.add_argument("--zipped_data", type=int, default=1, help="input data is zipped files") io_grp.add_argument( "--epochs", type=int, default=3, help="times of iterating over the training dataset" @@ -1117,4 +1117,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()
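
As context for the hunks above: the notebook cells being patched rename the SMP option `fp16_params` to `fp16` (dropping the duplicated key) and pin `pytorch_version="1.11"`. The sketch below shows roughly how those pieces fit together in an estimator call after the patch is applied. It is an illustration only, assuming the Hugging Face estimator from the SageMaker Python SDK as described in the GPT-2 notebook; the source directory, instance type, MPI process count, S3 output path, and hyperparameter values are placeholders, not values taken from these notebooks.

# Sketch only; assumed names and values are marked in comments. The keys inside
# "parameters" mirror the notebook cells changed by this patch.
import sagemaker
from sagemaker.huggingface import HuggingFace

# Placeholder hyperparameters; the notebooks build a much larger dict and pass
# it to the training script.
hyperparameters = {
    "pipeline_parallel_degree": 1,
    "shard_optimizer_state": 1,
    "prescaled_batch": 1,
    "fp16": 1,
    "manual_partition": 0,
    "optimize": "speed",
}

smp_options = {
    "enabled": True,
    "parameters": {
        # Keys visible in the patched cells; other keys (e.g. microbatches,
        # tensor_parallel_degree) are set in cells this patch does not touch.
        "partitions": hyperparameters["pipeline_parallel_degree"],
        "shard_optimizer_state": hyperparameters["shard_optimizer_state"] > 0,
        "prescaled_batch": hyperparameters["prescaled_batch"] > 0,
        "fp16": hyperparameters["fp16"] > 0,  # renamed from "fp16_params" by this patch
        "optimize": hyperparameters["optimize"],
        "auto_partition": False if hyperparameters["manual_partition"] else True,
        "default_partition": 0,
    },
}

estimator = HuggingFace(
    entry_point="train_gpt_simple.py",
    source_dir=".",                               # assumed source directory
    role=sagemaker.get_execution_role(),
    instance_type="ml.p4d.24xlarge",              # assumed instance type
    instance_count=1,
    distribution={
        "mpi": {"enabled": True, "processes_per_host": 8},   # assumed process count
        "smdistributed": {"modelparallel": smp_options},
    },
    hyperparameters=hyperparameters,
    pytorch_version="1.11",                       # bumped from 1.10 by this patch
    transformers_version="4.17",
    py_version="py38",
    output_path="s3://<bucket>/smp-gpt2-output",  # placeholder output location
)
# estimator.fit(...)  # the notebooks supply their own data channels here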