SageMaker-Debugger PT zcc deprecation (aws#3591)
* Updated CNN class activation example for PT 1.12 ZCC deprecation

* Updated PyTorch MNIST script change example

* Updated iterative model pruning examples to PT 1.12

* Updated profiler examples to be non-ZCC

* Changed nll_loss to NLLLoss

* Fixed build issues

* Removed vscode metadata from notebooks

* Renamed experiments to be model-specific
jleeleee authored and atqy committed Oct 28, 2022
1 parent 1069de9 commit 96bfccd
Showing 10 changed files with 51 additions and 34 deletions.
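
Taken together, the changes replace SageMaker Debugger's zero-code-change (ZCC) instrumentation, which is deprecated for PyTorch >= 1.12, with explicit hook registration in each training script. Below is a minimal sketch of that pattern, assuming the `smdebug` library is installed and the JSON hook configuration that SageMaker injects into the container is present; `model` and `loss_fn` are placeholders, not code from this commit:

```python
import torch.nn as nn
import smdebug.pytorch as smd

# Placeholder model and loss; the real scripts use their own nn.Module subclasses.
model = nn.Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10), nn.LogSoftmax(dim=1))
loss_fn = nn.NLLLoss()

# Reads the hook configuration that SageMaker writes into the training container
# (outside a SageMaker job you would need to point it at a local JSON config).
hook = smd.Hook.create_from_json_file()
hook.register_module(model)   # save module inputs/outputs
hook.register_loss(loss_fn)   # save loss values, e.g. as "NLLLoss_output_0"
```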
Changed file 1 of 10
@@ -80,7 +80,7 @@
" image.register_hook(self.backward_hook(\"image\"))\n",
" \n",
" def forward_hook(self, module, inputs, outputs):\n",
" module_name = self.module_maps[module] \n",
" module_name = module._module_name\n",
" self._write_inputs(module_name, inputs)\n",
" \n",
" #register outputs for backward pass. this is expensive, so we will only do it during EVAL mode\n",
@@ -326,6 +326,16 @@
"Before starting the SageMaker training job, we need to install some libraries. We will use `smdebug` library to read, filter and analyze raw tensors that are stored in Amazon S3. We will use `opencv-python` library to plot saliency maps as heatmap."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fab25828",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!apt-get update && apt-get install -y python3-opencv"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -570,8 +580,8 @@
" role=role,\n",
" train_instance_type=\"ml.p3.2xlarge\",\n",
" train_instance_count=1,\n",
" framework_version=\"1.3.1\",\n",
" py_version=\"py3\",\n",
" framework_version=\"1.12.0\",\n",
" py_version=\"py38\",\n",
" hyperparameters={\n",
" \"epochs\": 5,\n",
" \"batch_size_train\": 64,\n",
@@ -1325,9 +1335,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_pytorch_p36)",
"display_name": "Python 3.8.11 64-bit ('3.8.11')",
"language": "python",
"name": "conda_pytorch_p36"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1339,7 +1349,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.8.11"
},
"papermill": {
"default_parameters": {},
Changed file 2 of 10
@@ -10,7 +10,7 @@ def image_gradients(self, image):
image.register_hook(self.backward_hook("image"))

def forward_hook(self, module, inputs, outputs):
- module_name = self.module_maps[module]
+ module_name = module._module_name
self._write_inputs(module_name, inputs)

# register outputs for backward pass. this is expensive, so we will only do it during EVAL mode
Changed file 3 of 10
@@ -81,6 +81,7 @@ def train_model(epochs, batch_size_train, batch_size_val):
# create custom hook that has a customized forward function, so that we can get gradients of outputs
hook = custom_hook.CustomHook.create_from_json_file()
hook.register_module(model)
+ hook.register_loss(loss_function)

# get the dataloaders for train and test data
train_loader, val_loader = get_dataloaders(batch_size_train, batch_size_val)
Changed file 4 of 10
@@ -251,7 +251,7 @@
"# name of experiment\n",
"timestep = datetime.now()\n",
"timestep = timestep.strftime(\"%d-%m-%Y-%H-%M-%S\")\n",
"experiment_name = timestep + \"-model-pruning-experiment\"\n",
"experiment_name = timestep + \"-alexnet-model-pruning-experiment\"\n",
"\n",
"# create experiment\n",
"Experiment.create(\n",
@@ -372,12 +372,12 @@
"estimator = PyTorch(\n",
" role=sagemaker.get_execution_role(),\n",
" instance_count=1,\n",
" instance_type=\"ml.p2.xlarge\",\n",
" instance_type=\"ml.p3.2xlarge\",\n",
" volume_size=400,\n",
" source_dir=\"src\",\n",
" entry_point=\"train.py\",\n",
" framework_version=\"1.6\",\n",
" py_version=\"py3\",\n",
" framework_version=\"1.12\",\n",
" py_version=\"py38\",\n",
" metric_definitions=[\n",
" {\"Name\": \"train:loss\", \"Regex\": \"loss:(.*?)\"},\n",
" {\"Name\": \"eval:acc\", \"Regex\": \"acc:(.*?)\"},\n",
Changed file 5 of 10
@@ -216,7 +216,7 @@
"# name of experiment\n",
"timestep = datetime.now()\n",
"timestep = timestep.strftime(\"%d-%m-%Y-%H-%M-%S\")\n",
"experiment_name = timestep + \"-model-pruning-experiment\"\n",
"experiment_name = timestep + \"resnet-model-pruning-experiment\"\n",
"\n",
"# create experiment\n",
"Experiment.create(\n",
@@ -340,8 +340,8 @@
" volume_size=400,\n",
" source_dir=\"src\",\n",
" entry_point=\"train.py\",\n",
" framework_version=\"1.6\",\n",
" py_version=\"py3\",\n",
" framework_version=\"1.12\",\n",
" py_version=\"py38\",\n",
" metric_definitions=[\n",
" {\"Name\": \"train:loss\", \"Regex\": \"loss:(.*?)\"},\n",
" {\"Name\": \"eval:acc\", \"Regex\": \"acc:(.*?)\"},\n",
Changed file 6 of 10
@@ -98,13 +98,13 @@
"\n",
"Tensors that debug hook captures are stored in S3 location specified by you. There are two ways you can configure Amazon SageMaker Debugger for storage:\n",
"\n",
" 1. **Zero code change**: If you use any of SageMaker provided [Deep Learning containers](https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-containers-frameworks-deep-learning.html) then you don't need to make any changes to your training script for tensors to be stored. Amazon SageMaker Debugger will use the configuration you provide in the framework `Estimator` to save tensors in the fashion you specify.\n",
" 1. **Zero code change (DEPRECATED for PyTorch versions >= 1.12)**: If you use any of SageMaker provided [Deep Learning containers](https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-containers-frameworks-deep-learning.html) then you don't need to make any changes to your training script for tensors to be stored. Amazon SageMaker Debugger will use the configuration you provide in the framework `Estimator` to save tensors in the fashion you specify.\n",
" \n",
" **Note**: In case of PyTorch training, Debugger collects output tensors in GLOBAL mode by default. In other words, this option does not distinguish output tensors from different phases within an epoch, such as training phase and validation phase.\n",
" \n",
" 2. **Script change**: Use the SageMaker Debugger client library, SMDebug, and customize training scripts to save the specific tensors you want at different frequencies and configurations. Refer to the [DeveloperGuide](https://github.com/awslabs/sagemaker-debugger/tree/master/docs) for details on how to use SageMaker Debugger with your choice of framework in your training script.\n",
" \n",
"In this notebook, we choose the second option to properly save the output tensors from different training phases.\n",
"In this notebook, we choose the second option to properly save the output tensors from different training phases since we're using PyTorch=1.12\n",
"\n",
"### Analysis of tensors\n",
"\n",
@@ -289,19 +289,20 @@
" ```\n",
"\n",
"\n",
"- **Step 4**: In the `main()` function, create the SMDebug hook and register to the model.\n",
"- **Step 4**: In the `main()` function, create the SMDebug hook and register to the model and loss function.\n",
"\n",
" ```python\n",
" hook = smd.Hook.create_from_json_file()\n",
" hook.register_hook(model)\n",
" hook.register_loss(loss_fn)\n",
" ```\n",
"\n",
"\n",
"- **Step 4**: In the `main()` function, pass the SMDebug hook to the `train()` and `test()` functions in the epoch loop.\n",
"\n",
" ```python\n",
" train(args, model, device, train_loader, optimizer, epoch, hook)\n",
" test(model, device, test_loader, hook)\n",
" train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook)\n",
" test(model, device, loss_fn, test_loader, hook)\n",
" ```"
]
},
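
Steps 2 and 3 referenced in the training script below switch the hook between phases so that TRAIN and EVAL tensors are kept apart. A minimal sketch of what that looks like inside `train()` and `test()` (bodies elided, signatures as in this example's script):

```python
from smdebug.core.modes import ModeKeys


def train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook):
    model.train()
    hook.set_mode(ModeKeys.TRAIN)   # step 2: tensors saved from here on are tagged TRAIN
    ...                             # forward/backward pass as in the full script


def test(model, loss_fn, device, test_loader, hook):
    model.eval()
    hook.set_mode(ModeKeys.EVAL)    # step 3: tensors saved from here on are tagged EVAL
    ...                             # evaluation loop as in the full script
```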
@@ -983,7 +984,7 @@
},
"outputs": [],
"source": [
"len(trial.tensor(\"nll_loss_output_0\").steps(mode=ModeKeys.TRAIN))"
"len(trial.tensor(\"NLLLoss_output_0\").steps(mode=ModeKeys.TRAIN))"
]
},
{
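
To inspect these tensors outside of the notebook flow, here is a small reading sketch with the SMDebug trials API; the S3 path is a placeholder for the training job's debug-output location, and `NLLLoss_output_0` is the loss tensor name used throughout this example:

```python
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

# Placeholder path: point this at the job's actual debugger output location.
trial = create_trial("s3://<your-bucket>/<training-job-name>/debug-output")

print(trial.tensor_names())                      # every tensor the hook saved
loss = trial.tensor("NLLLoss_output_0")
train_steps = loss.steps(mode=ModeKeys.TRAIN)    # steps recorded during training
print([loss.value(step, mode=ModeKeys.TRAIN) for step in train_steps[:5]])
```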
@@ -1002,7 +1003,7 @@
},
"outputs": [],
"source": [
"len(trial.tensor(\"nll_loss_output_0\").steps(mode=ModeKeys.EVAL))"
"len(trial.tensor(\"NLLLoss_output_0\").steps(mode=ModeKeys.EVAL))"
]
},
{
@@ -1116,7 +1117,7 @@
},
"outputs": [],
"source": [
"plot_tensor(trial, \"nll_loss_output_0\")"
"plot_tensor(trial, \"NLLLoss_output_0\")"
]
},
{
@@ -1142,7 +1143,7 @@
"RuleEvaluationConditionMet: Evaluation of the rule Overfit at step 4000 resulted in the condition being met\n",
"```\n",
"\n",
"Based on this rule evaluation and the plot above, we can conclude that the training job has an overfit issue. While the `nll_loss_output_0` line is decreasing, the `val_nll_loss_output_0` line is fluctuating and not decreasing. \n",
"Based on this rule evaluation and the plot above, we can conclude that the training job has an overfit issue. While the `NLLLoss_output_0` line is decreasing, the `val_NLLLoss_output_0` line is fluctuating and not decreasing. \n",
"\n",
"To resolve the overfit problem, you need to consider using or double-checking the following techniques:\n",
"\n",
@@ -1277,9 +1278,9 @@
"metadata": {
"instance_type": "ml.g4dn.xlarge",
"kernelspec": {
"display_name": "Environment (conda_pytorch_p36)",
"display_name": "Python 3.8.11 64-bit ('3.8.11')",
"language": "python",
"name": "conda_pytorch_p36"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1291,7 +1292,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.8.11"
},
"papermill": {
"default_parameters": {},
@@ -1310,4 +1311,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
- }
+ }
Changed file 7 of 10
@@ -74,7 +74,7 @@ def forward(self, x):
return output


- def train(args, model, device, train_loader, optimizer, epoch, hook):
+ def train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook):
model.train()
# =================================================#
# 2. Set the SMDebug hook for the training phase. #
@@ -84,12 +84,12 @@ def train(args, model, device, train_loader, optimizer, epoch, hook):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
- loss = F.nll_loss(output, target)
+ loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print(
"Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
"Train Epoch: {} [{}/{} ({:.0f}%)]\t Loss: {:.6f}".format(
epoch,
batch_idx * len(data),
len(train_loader.dataset),
@@ -101,7 +101,7 @@ def train(args, model, device, train_loader, optimizer, epoch, hook):
break


- def test(model, device, test_loader, hook):
+ def test(model, loss_fn, device, test_loader, hook):
model.eval()
# ===================================================#
# 3. Set the SMDebug hook for the validation phase. #
@@ -113,7 +113,7 @@ def test(model, device, test_loader, hook):
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
- test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss
+ test_loss += loss_fn(output, target).item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()

@@ -201,12 +201,14 @@ def main():
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

model = Net().to(device)
+ loss_fn = nn.NLLLoss()

# ======================================================#
# 4. Register the SMDebug hook to save output tensors. #
# ======================================================#
hook = smd.Hook.create_from_json_file()
hook.register_hook(model)
+ hook.register_loss(loss_fn)

optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

@@ -215,8 +217,8 @@ def main():
# ===========================================================#
# 5. Pass the SMDebug hook to the train and test functions. #
# ===========================================================#
- train(args, model, device, train_loader, optimizer, epoch, hook)
- test(model, device, test_loader, hook)
+ train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook)
+ test(model, loss_fn, device, test_loader, hook)
scheduler.step()

if args.save_model:
Changed file 8 of 10
@@ -53,6 +53,7 @@ def train(batch_size, epoch, net, hook, device, local_rank):
epoch_times = []

if hook:
+ hook.register_module(net)
hook.register_loss(loss_optim)
# train the model

Changed file 9 of 10
@@ -101,6 +101,7 @@ def train(batch_size, epoch, net, hook, args, local_rank):

print("START VALIDATING")
if hook:
+ hook.register_module(net)
hook.set_mode(modes.EVAL)
test_sampler.set_epoch(i)
net.eval()
Changed file 10 of 10
@@ -75,6 +75,7 @@ def train(args, net, device):
epoch_times = []

if hook:
+ hook.register_module(net)
hook.register_loss(loss_optim)
# train the model

