From c2987ea4a9a60958f016ed2b1f97e116babc5904 Mon Sep 17 00:00:00 2001 From: Jihyeong Lee Date: Wed, 14 Sep 2022 16:27:02 -0400 Subject: [PATCH] SageMaker-Debugger PT zcc deprecation (#3591) * Updated CNN class activation example for PT 1.12 ZCC deprecation * Updated PyTorch MNIST script change example * updated iterative model pruning examples to PT 1.12 * Updated profiler examples to be nonzcc * Changed nll_loss to NLLLoss * Fixed build issues * Removed vscode metadata from notebooks * renamed experiments to be model specific --- .../cnn_class_activation_maps.ipynb | 22 ++++++++++----- .../entry_point/custom_hook.py | 2 +- .../entry_point/train.py | 1 + .../iterative_model_pruning_alexnet.ipynb | 8 +++--- .../iterative_model_pruning_resnet.ipynb | 6 ++--- .../pytorch_script_change_smdebug.ipynb | 27 ++++++++++--------- .../scripts/pytorch_mnist.py | 16 ++++++----- .../pt_res50_cifar10_distributed.py | 1 + .../pt_res50_cifar10_horovod_dataloader.py | 1 + .../pytorch_res50_cifar10_dataloader.py | 1 + 10 files changed, 51 insertions(+), 34 deletions(-) diff --git a/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/cnn_class_activation_maps.ipynb b/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/cnn_class_activation_maps.ipynb index 0bd3a5f04f..653b95e4e4 100644 --- a/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/cnn_class_activation_maps.ipynb +++ b/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/cnn_class_activation_maps.ipynb @@ -80,7 +80,7 @@ " image.register_hook(self.backward_hook(\"image\"))\n", " \n", " def forward_hook(self, module, inputs, outputs):\n", - " module_name = self.module_maps[module] \n", + " module_name = module._module_name\n", " self._write_inputs(module_name, inputs)\n", " \n", " #register outputs for backward pass. 
this is expensive, so we will only do it during EVAL mode\n", @@ -326,6 +326,16 @@ "Before starting the SageMaker training job, we need to install some libraries. We will use `smdebug` library to read, filter and analyze raw tensors that are stored in Amazon S3. We will use `opencv-python` library to plot saliency maps as heatmap." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fab25828", + "metadata": {}, + "outputs": [], + "source": [ + "!apt-get update && apt-get install -y python3-opencv" + ] + }, { "cell_type": "code", "execution_count": null, @@ -570,8 +580,8 @@ " role=role,\n", " train_instance_type=\"ml.p3.2xlarge\",\n", " train_instance_count=1,\n", - " framework_version=\"1.3.1\",\n", - " py_version=\"py3\",\n", + " framework_version=\"1.12.0\",\n", + " py_version=\"py38\",\n", " hyperparameters={\n", " \"epochs\": 5,\n", " \"batch_size_train\": 64,\n", @@ -1325,9 +1335,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Environment (conda_pytorch_p36)", + "display_name": "Python 3.8.11 64-bit ('3.8.11')", "language": "python", - "name": "conda_pytorch_p36" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1339,7 +1349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.11" }, "papermill": { "default_parameters": {}, diff --git a/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/custom_hook.py b/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/custom_hook.py index 1c445eb15f..8e94b15a88 100644 --- a/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/custom_hook.py +++ b/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/custom_hook.py @@ -10,7 +10,7 @@ def image_gradients(self, image): image.register_hook(self.backward_hook("image")) def forward_hook(self, module, inputs, outputs): - 
module_name = self.module_maps[module] + module_name = module._module_name self._write_inputs(module_name, inputs) # register outputs for backward pass. this is expensive, so we will only do it during EVAL mode diff --git a/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/train.py b/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/train.py index c5e57b9429..d45ab2a3a0 100644 --- a/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/train.py +++ b/sagemaker-debugger/model_specific_realtime_analysis/cnn_class_activation_maps/entry_point/train.py @@ -81,6 +81,7 @@ def train_model(epochs, batch_size_train, batch_size_val): # create custom hook that has a customized forward function, so that we can get gradients of outputs hook = custom_hook.CustomHook.create_from_json_file() hook.register_module(model) + hook.register_loss(loss_function) # get the dataloaders for train and test data train_loader, val_loader = get_dataloaders(batch_size_train, batch_size_val) diff --git a/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_alexnet.ipynb b/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_alexnet.ipynb index 37b1970e2c..ff5ab416fd 100644 --- a/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_alexnet.ipynb +++ b/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_alexnet.ipynb @@ -251,7 +251,7 @@ "# name of experiment\n", "timestep = datetime.now()\n", "timestep = timestep.strftime(\"%d-%m-%Y-%H-%M-%S\")\n", - "experiment_name = timestep + \"-model-pruning-experiment\"\n", + "experiment_name = timestep + \"-alexnet-model-pruning-experiment\"\n", "\n", "# create experiment\n", "Experiment.create(\n", @@ -372,12 +372,12 @@ "estimator = PyTorch(\n", " role=sagemaker.get_execution_role(),\n", " instance_count=1,\n", - " instance_type=\"ml.p2.xlarge\",\n", + " 
instance_type=\"ml.p3.2xlarge\",\n", " volume_size=400,\n", " source_dir=\"src\",\n", " entry_point=\"train.py\",\n", - " framework_version=\"1.6\",\n", - " py_version=\"py3\",\n", + " framework_version=\"1.12\",\n", + " py_version=\"py38\",\n", " metric_definitions=[\n", " {\"Name\": \"train:loss\", \"Regex\": \"loss:(.*?)\"},\n", " {\"Name\": \"eval:acc\", \"Regex\": \"acc:(.*?)\"},\n", diff --git a/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_resnet.ipynb b/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_resnet.ipynb index 9d4049371b..2c08e08870 100644 --- a/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_resnet.ipynb +++ b/sagemaker-debugger/pytorch_iterative_model_pruning/iterative_model_pruning_resnet.ipynb @@ -216,7 +216,7 @@ "# name of experiment\n", "timestep = datetime.now()\n", "timestep = timestep.strftime(\"%d-%m-%Y-%H-%M-%S\")\n", - "experiment_name = timestep + \"-model-pruning-experiment\"\n", + "experiment_name = timestep + \"-resnet-model-pruning-experiment\"\n", "\n", "# create experiment\n", "Experiment.create(\n", @@ -340,8 +340,8 @@ " volume_size=400,\n", " source_dir=\"src\",\n", " entry_point=\"train.py\",\n", - " framework_version=\"1.6\",\n", - " py_version=\"py3\",\n", + " framework_version=\"1.12\",\n", + " py_version=\"py38\",\n", " metric_definitions=[\n", " {\"Name\": \"train:loss\", \"Regex\": \"loss:(.*?)\"},\n", " {\"Name\": \"eval:acc\", \"Regex\": \"acc:(.*?)\"},\n", diff --git a/sagemaker-debugger/pytorch_model_debugging/pytorch_script_change_smdebug.ipynb b/sagemaker-debugger/pytorch_model_debugging/pytorch_script_change_smdebug.ipynb index 46cfd0be7b..8ae6dcc438 100644 --- a/sagemaker-debugger/pytorch_model_debugging/pytorch_script_change_smdebug.ipynb +++ b/sagemaker-debugger/pytorch_model_debugging/pytorch_script_change_smdebug.ipynb @@ -98,13 +98,13 @@ "\n", "Tensors that debug hook captures are stored in S3 location specified by you. 
There are two ways you can configure Amazon SageMaker Debugger for storage:\n", "\n", - " 1. **Zero code change**: If you use any of SageMaker provided [Deep Learning containers](https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-containers-frameworks-deep-learning.html) then you don't need to make any changes to your training script for tensors to be stored. Amazon SageMaker Debugger will use the configuration you provide in the framework `Estimator` to save tensors in the fashion you specify.\n", + " 1. **Zero code change (DEPRECATED for PyTorch versions >= 1.12)**: If you use any of SageMaker provided [Deep Learning containers](https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-containers-frameworks-deep-learning.html) then you don't need to make any changes to your training script for tensors to be stored. Amazon SageMaker Debugger will use the configuration you provide in the framework `Estimator` to save tensors in the fashion you specify.\n", " \n", " **Note**: In case of PyTorch training, Debugger collects output tensors in GLOBAL mode by default. In other words, this option does not distinguish output tensors from different phases within an epoch, such as training phase and validation phase.\n", " \n", " 2. **Script change**: Use the SageMaker Debugger client library, SMDebug, and customize training scripts to save the specific tensors you want at different frequencies and configurations. 
Refer to the [DeveloperGuide](https://github.com/awslabs/sagemaker-debugger/tree/master/docs) for details on how to use SageMaker Debugger with your choice of framework in your training script.\n", " \n", - "In this notebook, we choose the second option to properly save the output tensors from different training phases.\n", + "In this notebook, we choose the second option to properly save the output tensors from different training phases since we're using PyTorch 1.12.\n", "\n", "### Analysis of tensors\n", "\n", @@ -289,19 +289,20 @@ " ```\n", "\n", "\n", - "- **Step 4**: In the `main()` function, create the SMDebug hook and register to the model.\n", + "- **Step 4**: In the `main()` function, create the SMDebug hook and register to the model and loss function.\n", "\n", " ```python\n", " hook = smd.Hook.create_from_json_file()\n", " hook.register_hook(model)\n", + " hook.register_loss(loss_fn)\n", " ```\n", "\n", "\n", "- **Step 4**: In the `main()` function, pass the SMDebug hook to the `train()` and `test()` functions in the epoch loop.\n", "\n", " ```python\n", - " train(args, model, device, train_loader, optimizer, epoch, hook)\n", - " test(model, device, test_loader, hook)\n", + " train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook)\n", + " test(model, loss_fn, device, test_loader, hook)\n", " ```" ] }, @@ -983,7 +984,7 @@ }, "outputs": [], "source": [ - "len(trial.tensor(\"nll_loss_output_0\").steps(mode=ModeKeys.TRAIN))" + "len(trial.tensor(\"NLLLoss_output_0\").steps(mode=ModeKeys.TRAIN))" ] }, { @@ -1002,7 +1003,7 @@ }, "outputs": [], "source": [ - "len(trial.tensor(\"nll_loss_output_0\").steps(mode=ModeKeys.EVAL))" + "len(trial.tensor(\"NLLLoss_output_0\").steps(mode=ModeKeys.EVAL))" ] }, { @@ -1116,7 +1117,7 @@ }, "outputs": [], "source": [ - "plot_tensor(trial, \"nll_loss_output_0\")" + "plot_tensor(trial, \"NLLLoss_output_0\")" ] }, { @@ -1142,7 +1143,7 @@ "RuleEvaluationConditionMet: Evaluation of the rule Overfit at step 4000 
resulted in the condition being met\n", "```\n", "\n", - "Based on this rule evaluation and the plot above, we can conclude that the training job has an overfit issue. While the `nll_loss_output_0` line is decreasing, the `val_nll_loss_output_0` line is fluctuating and not decreasing. \n", + "Based on this rule evaluation and the plot above, we can conclude that the training job has an overfit issue. While the `NLLLoss_output_0` line is decreasing, the `val_NLLLoss_output_0` line is fluctuating and not decreasing. \n", "\n", "To resolve the overfit problem, you need to consider using or double-checking the following techniques:\n", "\n", @@ -1277,9 +1278,9 @@ "metadata": { "instance_type": "ml.g4dn.xlarge", "kernelspec": { - "display_name": "Environment (conda_pytorch_p36)", + "display_name": "Python 3.8.11 64-bit ('3.8.11')", "language": "python", - "name": "conda_pytorch_p36" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1291,7 +1292,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.11" }, "papermill": { "default_parameters": {}, @@ -1310,4 +1311,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/sagemaker-debugger/pytorch_model_debugging/scripts/pytorch_mnist.py b/sagemaker-debugger/pytorch_model_debugging/scripts/pytorch_mnist.py index d4342e3566..e9e43ffd08 100644 --- a/sagemaker-debugger/pytorch_model_debugging/scripts/pytorch_mnist.py +++ b/sagemaker-debugger/pytorch_model_debugging/scripts/pytorch_mnist.py @@ -74,7 +74,7 @@ def forward(self, x): return output -def train(args, model, device, train_loader, optimizer, epoch, hook): +def train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook): model.train() # =================================================# # 2. Set the SMDebug hook for the training phase. 
# @@ -84,12 +84,12 @@ def train(args, model, device, train_loader, optimizer, epoch, hook): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) - loss = F.nll_loss(output, target) + loss = loss_fn(output, target) loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: print( - "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + "Train Epoch: {} [{}/{} ({:.0f}%)]\t Loss: {:.6f}".format( epoch, batch_idx * len(data), len(train_loader.dataset), @@ -101,7 +101,7 @@ def train(args, model, device, train_loader, optimizer, epoch, hook): break -def test(model, device, test_loader, hook): +def test(model, loss_fn, device, test_loader, hook): model.eval() # ===================================================# # 3. Set the SMDebug hook for the validation phase. # @@ -113,7 +113,7 @@ def test(model, device, test_loader, hook): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss + test_loss += loss_fn(output, target).item() # sum up batch loss pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() @@ -201,12 +201,14 @@ def main(): test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) + loss_fn = nn.NLLLoss() # ======================================================# # 4. Register the SMDebug hook to save output tensors. # # ======================================================# hook = smd.Hook.create_from_json_file() hook.register_hook(model) + hook.register_loss(loss_fn) optimizer = optim.Adadelta(model.parameters(), lr=args.lr) @@ -215,8 +217,8 @@ def main(): # ===========================================================# # 5. Pass the SMDebug hook to the train and test functions. 
# # ===========================================================# - train(args, model, device, train_loader, optimizer, epoch, hook) - test(model, device, test_loader, hook) + train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook) + test(model, loss_fn, device, test_loader, hook) scheduler.step() if args.save_model: diff --git a/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_distributed.py b/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_distributed.py index 384333ce45..cc2e60f047 100644 --- a/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_distributed.py +++ b/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_distributed.py @@ -53,6 +53,7 @@ def train(batch_size, epoch, net, hook, device, local_rank): epoch_times = [] if hook: + hook.register_module(net) hook.register_loss(loss_optim) # train the model diff --git a/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_horovod_dataloader.py b/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_horovod_dataloader.py index 0a46a6dabd..290baf6738 100644 --- a/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_horovod_dataloader.py +++ b/sagemaker-debugger/pytorch_profiling/entry_point/pt_res50_cifar10_horovod_dataloader.py @@ -101,6 +101,7 @@ def train(batch_size, epoch, net, hook, args, local_rank): print("START VALIDATING") if hook: + hook.register_module(net) hook.set_mode(modes.EVAL) test_sampler.set_epoch(i) net.eval() diff --git a/sagemaker-debugger/pytorch_profiling/entry_point/pytorch_res50_cifar10_dataloader.py b/sagemaker-debugger/pytorch_profiling/entry_point/pytorch_res50_cifar10_dataloader.py index 095cd57032..6636b67979 100644 --- a/sagemaker-debugger/pytorch_profiling/entry_point/pytorch_res50_cifar10_dataloader.py +++ b/sagemaker-debugger/pytorch_profiling/entry_point/pytorch_res50_cifar10_dataloader.py @@ -75,6 +75,7 @@ def train(args, net, device): epoch_times = [] if 
hook: + hook.register_module(net) hook.register_loss(loss_optim) # train the model