Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SageMaker-Debugger PT zcc deprecation #3591

Merged
merged 11 commits into from
Sep 14, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
" image.register_hook(self.backward_hook(\"image\"))\n",
" \n",
" def forward_hook(self, module, inputs, outputs):\n",
" module_name = self.module_maps[module] \n",
" module_name = module._module_name\n",
" self._write_inputs(module_name, inputs)\n",
" \n",
" #register outputs for backward pass. this is expensive, so we will only do it during EVAL mode\n",
Expand Down Expand Up @@ -326,6 +326,16 @@
"Before starting the SageMaker training job, we need to install some libraries. We will use `smdebug` library to read, filter and analyze raw tensors that are stored in Amazon S3. We will use `opencv-python` library to plot saliency maps as heatmap."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fab25828",
"metadata": {},
"outputs": [],
"source": [
"!apt-get update && apt-get install -y python3-opencv"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -570,8 +580,8 @@
" role=role,\n",
" train_instance_type=\"ml.p3.2xlarge\",\n",
" train_instance_count=1,\n",
" framework_version=\"1.3.1\",\n",
" py_version=\"py3\",\n",
" framework_version=\"1.12.0\",\n",
" py_version=\"py38\",\n",
" hyperparameters={\n",
" \"epochs\": 5,\n",
" \"batch_size_train\": 64,\n",
Expand Down Expand Up @@ -1325,9 +1335,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_pytorch_p36)",
"display_name": "Python 3.8.11 64-bit ('3.8.11')",
"language": "python",
"name": "conda_pytorch_p36"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -1339,7 +1349,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.8.11"
},
"papermill": {
"default_parameters": {},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def image_gradients(self, image):
image.register_hook(self.backward_hook("image"))

def forward_hook(self, module, inputs, outputs):
module_name = self.module_maps[module]
module_name = module._module_name
self._write_inputs(module_name, inputs)

# register outputs for backward pass. this is expensive, so we will only do it during EVAL mode
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def train_model(epochs, batch_size_train, batch_size_val):
# create custom hook that has a customized forward function, so that we can get gradients of outputs
hook = custom_hook.CustomHook.create_from_json_file()
hook.register_module(model)
hook.register_loss(loss_function)

# get the dataloaders for train and test data
train_loader, val_loader = get_dataloaders(batch_size_train, batch_size_val)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@
"# name of experiment\n",
"timestep = datetime.now()\n",
"timestep = timestep.strftime(\"%d-%m-%Y-%H-%M-%S\")\n",
"experiment_name = timestep + \"-model-pruning-experiment\"\n",
"experiment_name = timestep + \"-alexnet-model-pruning-experiment\"\n",
"\n",
"# create experiment\n",
"Experiment.create(\n",
Expand Down Expand Up @@ -372,12 +372,12 @@
"estimator = PyTorch(\n",
" role=sagemaker.get_execution_role(),\n",
" instance_count=1,\n",
" instance_type=\"ml.p2.xlarge\",\n",
" instance_type=\"ml.p3.2xlarge\",\n",
" volume_size=400,\n",
" source_dir=\"src\",\n",
" entry_point=\"train.py\",\n",
" framework_version=\"1.6\",\n",
" py_version=\"py3\",\n",
" framework_version=\"1.12\",\n",
" py_version=\"py38\",\n",
" metric_definitions=[\n",
" {\"Name\": \"train:loss\", \"Regex\": \"loss:(.*?)\"},\n",
" {\"Name\": \"eval:acc\", \"Regex\": \"acc:(.*?)\"},\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@
"# name of experiment\n",
"timestep = datetime.now()\n",
"timestep = timestep.strftime(\"%d-%m-%Y-%H-%M-%S\")\n",
"experiment_name = timestep + \"-model-pruning-experiment\"\n",
"experiment_name = timestep + \"resnet-model-pruning-experiment\"\n",
"\n",
"# create experiment\n",
"Experiment.create(\n",
Expand Down Expand Up @@ -340,8 +340,8 @@
" volume_size=400,\n",
" source_dir=\"src\",\n",
" entry_point=\"train.py\",\n",
" framework_version=\"1.6\",\n",
" py_version=\"py3\",\n",
" framework_version=\"1.12\",\n",
" py_version=\"py38\",\n",
" metric_definitions=[\n",
" {\"Name\": \"train:loss\", \"Regex\": \"loss:(.*?)\"},\n",
" {\"Name\": \"eval:acc\", \"Regex\": \"acc:(.*?)\"},\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,13 @@
"\n",
"Tensors that debug hook captures are stored in S3 location specified by you. There are two ways you can configure Amazon SageMaker Debugger for storage:\n",
"\n",
" 1. **Zero code change**: If you use any of SageMaker provided [Deep Learning containers](https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-containers-frameworks-deep-learning.html) then you don't need to make any changes to your training script for tensors to be stored. Amazon SageMaker Debugger will use the configuration you provide in the framework `Estimator` to save tensors in the fashion you specify.\n",
" 1. **Zero code change (DEPRECATED for PyTorch versions >= 1.12)**: If you use any of SageMaker provided [Deep Learning containers](https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-containers-frameworks-deep-learning.html) then you don't need to make any changes to your training script for tensors to be stored. Amazon SageMaker Debugger will use the configuration you provide in the framework `Estimator` to save tensors in the fashion you specify.\n",
" \n",
" **Note**: In case of PyTorch training, Debugger collects output tensors in GLOBAL mode by default. In other words, this option does not distinguish output tensors from different phases within an epoch, such as training phase and validation phase.\n",
" \n",
" 2. **Script change**: Use the SageMaker Debugger client library, SMDebug, and customize training scripts to save the specific tensors you want at different frequencies and configurations. Refer to the [DeveloperGuide](https://github.com/awslabs/sagemaker-debugger/tree/master/docs) for details on how to use SageMaker Debugger with your choice of framework in your training script.\n",
" \n",
"In this notebook, we choose the second option to properly save the output tensors from different training phases.\n",
"In this notebook, we choose the second option to properly save the output tensors from different training phases since we're using PyTorch=1.12\n",
"\n",
"### Analysis of tensors\n",
"\n",
Expand Down Expand Up @@ -289,19 +289,20 @@
" ```\n",
"\n",
"\n",
"- **Step 4**: In the `main()` function, create the SMDebug hook and register to the model.\n",
"- **Step 4**: In the `main()` function, create the SMDebug hook and register to the model and loss function.\n",
"\n",
" ```python\n",
" hook = smd.Hook.create_from_json_file()\n",
" hook.register_hook(model)\n",
" hook.register_loss(loss_fn)\n",
" ```\n",
"\n",
"\n",
"- **Step 4**: In the `main()` function, pass the SMDebug hook to the `train()` and `test()` functions in the epoch loop.\n",
"\n",
" ```python\n",
" train(args, model, device, train_loader, optimizer, epoch, hook)\n",
" test(model, device, test_loader, hook)\n",
" train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook)\n",
" test(model, device, loss_fn, test_loader, hook)\n",
" ```"
]
},
Expand Down Expand Up @@ -983,7 +984,7 @@
},
"outputs": [],
"source": [
"len(trial.tensor(\"nll_loss_output_0\").steps(mode=ModeKeys.TRAIN))"
"len(trial.tensor(\"NLLLoss_output_0\").steps(mode=ModeKeys.TRAIN))"
]
},
{
Expand All @@ -1002,7 +1003,7 @@
},
"outputs": [],
"source": [
"len(trial.tensor(\"nll_loss_output_0\").steps(mode=ModeKeys.EVAL))"
"len(trial.tensor(\"NLLLoss_output_0\").steps(mode=ModeKeys.EVAL))"
]
},
{
Expand Down Expand Up @@ -1116,7 +1117,7 @@
},
"outputs": [],
"source": [
"plot_tensor(trial, \"nll_loss_output_0\")"
"plot_tensor(trial, \"NLLLoss_output_0\")"
]
},
{
Expand All @@ -1142,7 +1143,7 @@
"RuleEvaluationConditionMet: Evaluation of the rule Overfit at step 4000 resulted in the condition being met\n",
"```\n",
"\n",
"Based on this rule evaluation and the plot above, we can conclude that the training job has an overfit issue. While the `nll_loss_output_0` line is decreasing, the `val_nll_loss_output_0` line is fluctuating and not decreasing. \n",
"Based on this rule evaluation and the plot above, we can conclude that the training job has an overfit issue. While the `NLLLoss_output_0` line is decreasing, the `val_NLLLoss_output_0` line is fluctuating and not decreasing. \n",
"\n",
"To resolve the overfit problem, you need to consider using or double-checking the following techniques:\n",
"\n",
Expand Down Expand Up @@ -1277,9 +1278,9 @@
"metadata": {
"instance_type": "ml.g4dn.xlarge",
"kernelspec": {
"display_name": "Environment (conda_pytorch_p36)",
"display_name": "Python 3.8.11 64-bit ('3.8.11')",
"language": "python",
"name": "conda_pytorch_p36"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -1291,7 +1292,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.8.11"
},
"papermill": {
"default_parameters": {},
Expand All @@ -1310,4 +1311,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def forward(self, x):
return output


def train(args, model, device, train_loader, optimizer, epoch, hook):
def train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook):
model.train()
# =================================================#
# 2. Set the SMDebug hook for the training phase. #
Expand All @@ -84,12 +84,12 @@ def train(args, model, device, train_loader, optimizer, epoch, hook):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print(
"Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
"Train Epoch: {} [{}/{} ({:.0f}%)]\t Loss: {:.6f}".format(
epoch,
batch_idx * len(data),
len(train_loader.dataset),
Expand All @@ -101,7 +101,7 @@ def train(args, model, device, train_loader, optimizer, epoch, hook):
break


def test(model, device, test_loader, hook):
def test(model, loss_fn, device, test_loader, hook):
model.eval()
# ===================================================#
# 3. Set the SMDebug hook for the validation phase. #
Expand All @@ -113,7 +113,7 @@ def test(model, device, test_loader, hook):
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss
test_loss += loss_fn(output, target).item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()

Expand Down Expand Up @@ -201,12 +201,14 @@ def main():
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

model = Net().to(device)
loss_fn = nn.NLLLoss()

# ======================================================#
# 4. Register the SMDebug hook to save output tensors. #
# ======================================================#
hook = smd.Hook.create_from_json_file()
hook.register_hook(model)
hook.register_loss(loss_fn)

optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

Expand All @@ -215,8 +217,8 @@ def main():
# ===========================================================#
# 5. Pass the SMDebug hook to the train and test functions. #
# ===========================================================#
train(args, model, device, train_loader, optimizer, epoch, hook)
test(model, device, test_loader, hook)
train(args, model, loss_fn, device, train_loader, optimizer, epoch, hook)
test(model, loss_fn, device, test_loader, hook)
scheduler.step()

if args.save_model:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def train(batch_size, epoch, net, hook, device, local_rank):
epoch_times = []

if hook:
hook.register_module(net)
hook.register_loss(loss_optim)
# train the model

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def train(batch_size, epoch, net, hook, args, local_rank):

print("START VALIDATING")
if hook:
hook.register_module(net)
hook.set_mode(modes.EVAL)
test_sampler.set_epoch(i)
net.eval()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def train(args, net, device):
epoch_times = []

if hook:
hook.register_module(net)
hook.register_loss(loss_optim)
# train the model

Expand Down