Update SMMP GPT sample #3433

Merged · 24 commits · Oct 13, 2022
Changes from 20 commits
@@ -21,7 +21,7 @@
"This notebook depends on the following files and folders:\n",
"\n",
"1. `train_gptj_smp_script.py`: This is an entrypoint script that is passed to the PyTorch estimator in the notebook instructions. This script is responsible for end to end training of the GPT-J model with SMP. The script has additional comments at places where the SMP API is used.\n",
"2. `fp16`: This folder is used for 16-bit float training, which contains a fp16 optimizer and various fp16 utilities.\n",
"2. `memory_tracker.py`: This contains the functions to track memory usage.\n",
"3. `learning_rates.py`: This contains the functions for learning rate schedule.\n",
"4. `requirements.txt`: This will install the dependencies, like the right version of huggingface transformers.\n",
"5. `preprocess.py`: This will download and preprocess the sst2/glue dataset.\n",
@@ -39,7 +39,7 @@
"! pip install -qU \"sagemaker>=2,<3\"\n",
"! pip install -qU sagemaker-experiments\n",
"\n",
"# import IPython\n",
"# import IPythonfp16\n",
haohanchen-aws marked this conversation as resolved.
Show resolved Hide resolved
"# IPython.Application.instance().kernel.do_shutdown(True)"
]
},
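The cell above pins `sagemaker>=2,<3` and hints at restarting the kernel after the upgrade. A quick sanity check once the kernel comes back (a sketch, not part of the notebook):

```python
# Confirm the upgraded SDK is the one the kernel sees (sketch).
import sagemaker

print(sagemaker.__version__)  # expect a 2.x release, matching the >=2,<3 pin
```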
@@ -82,7 +82,7 @@
"import os\n",
"\n",
"from sagemaker import get_execution_role\n",
"from sagemaker.huggingface import HuggingFace\n",
"from sagemaker.pytorch import PyTorch\n",
"from smexperiments.experiment import Experiment\n",
"from smexperiments.trial import Trial\n",
"import boto3\n",
@@ -611,6 +611,7 @@
" \"activation_checkpointing\": 1,\n",
" \"activation_strategy\": \"each\",\n",
" \"optimize\": \"speed\",\n",
" \"zipped_data\": 0,\n",
" # below flag loads model and optimizer state from checkpoint_s3_uri\n",
" # 'load_partial': 1,\n",
"}\n",
@@ -826,7 +827,7 @@
" kwargs[\"security_group_ids\"] = [fsx_security_group_id]\n",
" kwargs[\"subnets\"] = [fsx_subnet]\n",
"\n",
"smp_estimator = HuggingFace(\n",
"smp_estimator = PyTorch(\n",
" entry_point=\"train_gptj_smp_tensor_parallel_script.py\",\n",
" source_dir=os.getcwd(),\n",
" role=role,\n",
@@ -851,18 +852,16 @@
" \"partitions\": hyperparameters[\"pipeline_parallel_degree\"],\n",
" \"shard_optimizer_state\": hyperparameters[\"shard_optimizer_state\"] > 0,\n",
" \"prescaled_batch\": hyperparameters[\"prescaled_batch\"] > 0,\n",
" \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n",
" \"fp16\": hyperparameters[\"fp16\"] > 0,\n",
" \"optimize\": hyperparameters[\"optimize\"],\n",
" \"auto_partition\": False if hyperparameters[\"manual_partition\"] else True,\n",
" \"default_partition\": 0,\n",
" \"fp16_params\": hyperparameters[\"fp16\"] > 0,\n",
" \"optimize\": hyperparameters[\"optimize\"],\n",
" },\n",
" }\n",
" },\n",
" },\n",
" pytorch_version=\"1.10\",\n",
" transformers_version=\"4.17\",\n",
" framework_version=\"1.12\",\n",
" py_version=\"py38\",\n",
" output_path=s3_output_bucket,\n",
" checkpoint_s3_uri=checkpoint_s3_uri if not use_fsx else None,\n",
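Net effect of this hunk: `fp16_params` is renamed to `fp16`, the duplicated `fp16_params`/`optimize` entries are removed, and the HuggingFace estimator's `pytorch_version`/`transformers_version` arguments give way to the PyTorch estimator's `framework_version`/`py_version`. Restricted to the keys visible in the diff, the resulting `modelparallel` parameters block looks like this sketch (the full notebook config has more entries, and `hyperparameters` is the dict defined earlier in the notebook):

```python
# Resulting "parameters" block after this change; keys limited to those
# visible in the diff, assuming the hyperparameters dict defined above.
smp_parameters = {
    "partitions": hyperparameters["pipeline_parallel_degree"],
    "shard_optimizer_state": hyperparameters["shard_optimizer_state"] > 0,
    "prescaled_batch": hyperparameters["prescaled_batch"] > 0,
    "fp16": hyperparameters["fp16"] > 0,  # renamed from fp16_params
    "optimize": hyperparameters["optimize"],
    "auto_partition": not hyperparameters["manual_partition"],  # same as False if ... else True
    "default_partition": 0,
}
```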

This file was deleted.
