From a45987a45a0180b68a65a3a25606df745bc0c157 Mon Sep 17 00:00:00 2001 From: haohanchen-yagao Date: Thu, 26 May 2022 13:17:02 -0700 Subject: [PATCH] Update SMMP notebooks --- .../gpt-j/01_train_gptj_smp_notebook.ipynb | 2 +- .../11_train_gptj_smp_tensor_parallel_notebook.ipynb | 7 +++---- .../model_parallel/gpt2/smp-train-gpt-simple.ipynb | 11 +++++------ .../pytorch/model_parallel/gpt2/train_gpt_simple.py | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb b/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb index c8ca985ce6..4b4a0a61bc 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb +++ b/training/distributed_training/pytorch/model_parallel/gpt-j/01_train_gptj_smp_notebook.ipynb @@ -21,7 +21,7 @@ "This notebook depends on the following files and folders:\n", "\n", "1. `train_gptj_smp_script.py`: This is an entrypoint script that is passed to the PyTorch estimator in the notebook instructions. This script is responsible for end to end training of the GPT-J model with SMP. The script has additional comments at places where the SMP API is used.\n", - "2. `fp16`: This folder is used for 16-bit float training, which contains a fp16 optimizer and various fp16 utilities.\n", + "2. `memory_tracker.py`: This contains the functions to track memory usage.\n", "3. `learning_rates.py`: This contains the functions for learning rate schedule.\n", "4. `requirements.txt`: This will install the dependencies, like the right version of huggingface transformers.\n", "5. `preprocess.py`: This will download and preprocess the sst2/glue dataset.\n", diff --git a/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb b/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb index 9f4d8ddb7c..5f74112189 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb +++ b/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb @@ -39,7 +39,7 @@ "! pip install -qU \"sagemaker>=2,<3\"\n", "! 
pip install -qU sagemaker-experiments\n",
"\n",
- "# import IPython\n",
+ "# import IPython\n",
"# IPython.Application.instance().kernel.do_shutdown(True)"
]
},
@@ -851,17 +851,16 @@
" \"partitions\": hyperparameters[\"pipeline_parallel_degree\"],\n",
" \"shard_optimizer_state\": hyperparameters[\"shard_optimizer_state\"] > 0,\n",
" \"prescaled_batch\": hyperparameters[\"prescaled_batch\"] > 0,\n",
- " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n",
+ " \"fp16\": hyperparameters[\"fp16\"] > 0,\n",
" \"optimize\": hyperparameters[\"optimize\"],\n",
" \"auto_partition\": False if hyperparameters[\"manual_partition\"] else True,\n",
" \"default_partition\": 0,\n",
- " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n",
" \"optimize\": hyperparameters[\"optimize\"],\n",
" },\n",
" }\n",
" },\n",
" },\n",
- " pytorch_version=\"1.10\",\n",
+ " pytorch_version=\"1.11\",\n",
" transformers_version=\"4.17\",\n",
" py_version=\"py38\",\n",
" output_path=s3_output_bucket,\n",
diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb
index 0668dda4dd..8ad4c78618 100644
--- a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb
+++ b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Train GPT-2 with PyTorch 1.8.1 and Tensor Parallelism Using the SageMaker Model Parallelism Library"
+ "# Train GPT-2 with PyTorch 1.11 and Tensor Parallelism Using the SageMaker Model Parallelism Library"
]
},
{
@@ -13,16 +13,16 @@
"source": [
"This notebook walks you through how to use the SageMaker model parallelism (SMP) library. You'll learn how to train the GPT-2 model with SageMaker's model parallelism.\n",
"\n",
- "The GPT-2 model was proposed by OpenAI in paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration. Currently, SMP only supports Hugging Face Transformers version 4.4.2.\n",
+ "The GPT-2 model was proposed by OpenAI in the paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration. \n",
"\n",
"This notebook depends on the following files and folders:\n",
"\n",
"- `train_gpt_simple.py`: This is an entrypoint script that is passed to the Hugging Face estimator in the notebook instructions. This script is responsible for end to end training of the GPT-2 model with SMP. 
The script has additional comments at places where the SMP API is used.\n", - "- `fp16`: This folder is used for 16-bit float training, which contains a fp16 optimizer and various fp16 utilities.\n", "- `data_pipeline.py`: This contains the datapipeline function to prepare the training data.\n", "- `learining_rate.py`: This contains the functions for learning rate schedule.\n", "- `requirements.txt`: This will install the dependencies, like the right version of huggingface transformers.\n", "- `data_prep_512.py`: This will download and preprocess the openwebtext dataset.\n", + "- `memory_tracker.py`: This contains the functions to track memory usage.\n", "\n", "### Additional Resources\n", "If you are a new user of Amazon SageMaker, you may find the following helpful to learn more about SMP and using SageMaker with PyTorch.\n", @@ -569,17 +569,16 @@ " \"partitions\": hyperparameters[\"pipeline_parallel_degree\"],\n", " \"shard_optimizer_state\": hyperparameters[\"shard_optimizer_state\"] > 0,\n", " \"prescaled_batch\": hyperparameters[\"prescaled_batch\"] > 0,\n", - " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n", + " \"fp16\": hyperparameters[\"fp16\"] > 0,\n", " \"optimize\": hyperparameters[\"optimize\"],\n", " \"auto_partition\": False if hyperparameters[\"manual_partition\"] else True,\n", " \"default_partition\": 0,\n", - " \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n", " \"optimize\": hyperparameters[\"optimize\"],\n", " },\n", " }\n", " },\n", " },\n", - " pytorch_version=\"1.10\",\n", + " pytorch_version=\"1.11\",\n", " transformers_version=\"4.17\",\n", " py_version=\"py38\",\n", " output_path=s3_output_location,\n", diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py b/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py index f18ca789cb..2b676db9fb 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py +++ b/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py @@ -651,7 +651,7 @@ def parse_args(): # I/O io_grp = parser.add_argument_group(title="io", description="location for input and output") - io_grp.add_argument("--use_wiki_data", type=int, default=0, help="use bert data for training") + io_grp.add_argument("--use_wiki_data", type=int, default=0, help="use wiki corpus data for training") io_grp.add_argument("--zipped_data", type=int, default=1, help="input data is zipped files") io_grp.add_argument( "--epochs", type=int, default=3, help="times of iterating over the training dataset" @@ -1117,4 +1117,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()
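
As context for the hunks above: the notebook cells being patched rename the SMP option `fp16_params` to `fp16` (dropping the duplicated key) and pin `pytorch_version="1.11"`. The sketch below shows roughly how those pieces fit together in an estimator call after the patch is applied. It is an illustration only, assuming the Hugging Face estimator from the SageMaker Python SDK as described in the GPT-2 notebook; the source directory, instance type, MPI process count, S3 output path, and hyperparameter values are placeholders, not values taken from these notebooks.

# Sketch only; assumed names and values are marked in comments. The keys inside
# "parameters" mirror the notebook cells changed by this patch.
import sagemaker
from sagemaker.huggingface import HuggingFace

# Placeholder hyperparameters; the notebooks build a much larger dict and pass
# it to the training script.
hyperparameters = {
    "pipeline_parallel_degree": 1,
    "shard_optimizer_state": 1,
    "prescaled_batch": 1,
    "fp16": 1,
    "manual_partition": 0,
    "optimize": "speed",
}

smp_options = {
    "enabled": True,
    "parameters": {
        # Keys visible in the patched cells; other keys (e.g. microbatches,
        # tensor_parallel_degree) are set in cells this patch does not touch.
        "partitions": hyperparameters["pipeline_parallel_degree"],
        "shard_optimizer_state": hyperparameters["shard_optimizer_state"] > 0,
        "prescaled_batch": hyperparameters["prescaled_batch"] > 0,
        "fp16": hyperparameters["fp16"] > 0,  # renamed from "fp16_params" by this patch
        "optimize": hyperparameters["optimize"],
        "auto_partition": False if hyperparameters["manual_partition"] else True,
        "default_partition": 0,
    },
}

estimator = HuggingFace(
    entry_point="train_gpt_simple.py",
    source_dir=".",                               # assumed source directory
    role=sagemaker.get_execution_role(),
    instance_type="ml.p4d.24xlarge",              # assumed instance type
    instance_count=1,
    distribution={
        "mpi": {"enabled": True, "processes_per_host": 8},   # assumed process count
        "smdistributed": {"modelparallel": smp_options},
    },
    hyperparameters=hyperparameters,
    pytorch_version="1.11",                       # bumped from 1.10 by this patch
    transformers_version="4.17",
    py_version="py38",
    output_path="s3://<bucket>/smp-gpt2-output",  # placeholder output location
)
# estimator.fit(...)  # the notebooks supply their own data channels here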