Skip to content

Commit

Permalink
Change default collections to only CollectionKeys.LOSSES (aws#347)
Browse files Browse the repository at this point in the history
* PyTorch only registers loss by default

* Remove unreachable RuntimeError

* Remove methods

* Remove imports

* Make index_meta clearer

* Remove reset_collections() from fixture

* Merge with alpha

* Skip test for now

* Remove logging warning that crashed on teardown
  • Loading branch information
jarednielsen authored Nov 11, 2019
1 parent 3377070 commit 1134f64
Show file tree
Hide file tree
Showing 13 changed files with 146 additions and 77 deletions.
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import pytest
import shutil


def pytest_addoption(parser):
Expand All @@ -27,3 +28,15 @@ def pytest_collection_modifyitems(config, items):
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)


@pytest.fixture(scope="function")
def out_dir():
    """Hand a test a fresh output directory path.

    Whatever an earlier test left at the path is deleted before the path is
    returned, so all tests reuse one location instead of each leaving its own
    folder behind. The directory is not created here, and nothing is removed
    after the test finishes.
    """
    path = "/tmp/test"
    # ignore_errors=True: a missing directory is the normal first-run case.
    shutil.rmtree(path, ignore_errors=True)
    return path
90 changes: 59 additions & 31 deletions tests/pytorch/test_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,58 +31,86 @@ def forward(self, x):
return x


@pytest.mark.slow # 0:05 to run
def test_register_loss():
"""Test that the loss is saved as a tensor."""
ts.reset_collections()
out_dir = "/tmp/pytorch_test_loss"
shutil.rmtree(out_dir, ignore_errors=True)
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    """Train `Net` for `n_steps` SGD steps with a TornasoleHook writing to `out_dir`.

    Exactly one of the two flags must be true; it selects how the loss is
    produced and how it reaches the hook:

    * `use_loss_module`: the loss is an `nn.CrossEntropyLoss` module that is
      registered with the hook via `hook.register_loss`.
    * `use_loss_functional`: the loss comes from `F.cross_entropy` and is
      handed to the hook each step via `hook.record_tensor_value` under the
      name "cross_entropy_loss".

    The hook is closed and deleted before returning, so the caller can open
    the output with the Trials API right away.
    """
    assert (
        use_loss_module != use_loss_functional
    ), "Exactly one of `use_loss_module` and `use_loss_functional` must be true."

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Start from a clean collection registry, then build the hook exactly once.
    ts.reset_collections()
    hook = ts.TornasoleHook(out_dir=out_dir, save_config=ts.SaveConfig(save_interval=1))
    hook.register_hook(net)
    if use_loss_module:
        hook.register_loss(criterion)

    batch_size = 1
    # Use the same data at each step to test loss decreasing
    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        else:
            loss = F.cross_entropy(outputs, labels)
            hook.record_tensor_value(tensor_name="cross_entropy_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Users can call this method to immediately use the Trials API.
    hook.close()
    ts.del_hook()


@pytest.mark.slow  # 0:05 to run
def test_register_loss_functional(out_dir):
    """Test that a loss computed with F.cross_entropy is saved as a tensor."""
    n_steps = 5
    create_net_and_train(out_dir=out_dir, n_steps=n_steps, use_loss_functional=True)

    trial = create_trial(path=out_dir)
    loss_coll = trial.collection("losses")
    loss_tensor = trial.tensor("cross_entropy_loss_output_0")

    # Capture ['cross_entropy_loss_output_0']
    assert len(trial.tensors()) == 1
    assert len(loss_coll.tensor_names) == 1

    # The training helper builds its hook with save_interval=1, so the loss
    # should be logged at every step.
    assert len(trial.steps()) == n_steps
    assert len(loss_tensor.steps()) == n_steps

    # Loss should be decreasing: compare the first and the last recorded step.
    assert loss_tensor.value(0) > loss_tensor.value(n_steps - 1)


@pytest.mark.slow  # 0:05 to run
@pytest.mark.skip(
    "Nihal will re-enable"
)  # TODO (NihalHarish): Re-enable after removing the cache singleton.
def test_register_loss_module(out_dir):
    """Test that the loss (as nn.Module) is saved as a tensor.

    Also test that nothing else is saved under the default config.
    """
    n_steps = 5
    create_net_and_train(out_dir=out_dir, n_steps=n_steps, use_loss_module=True)

    trial = create_trial(path=out_dir)
    loss_coll = trial.collection("losses")
    loss_tensor = trial.tensor("CrossEntropyLoss_output_0")

    # Capture ['CrossEntropyLoss_input_0', 'CrossEntropyLoss_input_1', 'CrossEntropyLoss_output_0']
    assert len(trial.tensors()) == 3
    assert len(loss_coll.tensor_names) == 3

    # The training helper builds its hook with save_interval=1, so the loss
    # should be logged at every step.
    assert len(trial.steps()) == n_steps
    assert len(loss_tensor.steps()) == n_steps

    # Loss should be decreasing: compare the first and the last recorded step.
    assert loss_tensor.value(0) > loss_tensor.value(n_steps - 1)
13 changes: 12 additions & 1 deletion tests/pytorch/test_modes.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import print_function
import numpy as np
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from tornasole import modes, SaveConfig, SaveConfigMode
import tornasole.pytorch as ts
from tornasole.pytorch.hook import *
from tornasole.pytorch.collection import *
from tornasole.pytorch import reset_collections
Expand Down Expand Up @@ -61,6 +63,7 @@ def train(model, device, optimizer, num_steps=500, save_steps=[]):
optimizer.zero_grad()
output = model(Variable(data, requires_grad=True))
loss = F.nll_loss(output, target)
ts.get_hook().record_tensor_value(tensor_name="my_loss", tensor_value=loss)
loss.backward()
if i in save_steps:
model.saved["gradient/Net_fc1.weight"][i] = model.fc1.weight.grad.data.numpy().copy()
Expand All @@ -84,16 +87,24 @@ def helper_test_modes(hook=None, out_dir="./test_output/test_hook_modes/"):
model = Net(to_save=save_steps).to(device)
json = hook is not None
if hook is None:
out_dir = out_dir + "/" + prefix
out_dir = str(Path(out_dir, prefix))
hook = TornasoleHook(
out_dir=out_dir,
save_config=SaveConfig({modes.TRAIN: SaveConfigMode(save_steps=save_steps)}),
include_collections=[
CollectionKeys.WEIGHTS,
CollectionKeys.BIASES,
CollectionKeys.GRADIENTS,
CollectionKeys.DEFAULT,
CollectionKeys.LOSSES,
],
)

hook.register_hook(model)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
hook.set_mode(mode=modes.TRAIN)
train(model, device, optimizer, num_steps=10, save_steps=save_steps)

trial = create_trial(path=out_dir, name="test output")

assert len(trial.modes()) == 1
Expand Down
12 changes: 11 additions & 1 deletion tests/pytorch/test_simple_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,17 @@ def create_tornasole_hook(output_dir, module=None, hook_type="saveall", save_ste
elif hook_type == "weights-bias-gradients":
save_config = SaveConfig(save_steps=save_steps)
# Create a hook that logs ONLY weights, biases, and gradients
hook = TornasoleHook(out_dir=output_dir, save_config=save_config)
hook = TornasoleHook(
out_dir=output_dir,
save_config=save_config,
include_collections=[
CollectionKeys.WEIGHTS,
CollectionKeys.BIASES,
CollectionKeys.GRADIENTS,
CollectionKeys.DEFAULT,
CollectionKeys.LOSSES,
],
)
return hook


Expand Down
46 changes: 23 additions & 23 deletions tornasole/core/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,29 +52,29 @@ class CollectionKeys:

class Collection:
"""
Collection object helps group tensors for easier handling during saving as well
as analysis. A collection has its own list of tensors, reduction config
and save config. This allows setting of different save and reduction configs
for different tensors.
...
Attributes
----------
name: str
name of collection
include_regex: list of (str representing regex for tensor names or block names)
list of regex expressions representing names of tensors (tf) or blocks(gluon)
to include for this collection
reduction_config: ReductionConfig object
reduction config to be applied for this collection.
if this is not passed, uses the default reduction_config
save_config: SaveConfig object
save config to be applied for this collection.
if this is not passed, uses the default save_config
"""
Collection object helps group tensors for easier handling during saving as well
as analysis. A collection has its own list of tensors, reduction config
and save config. This allows setting of different save and reduction configs
for different tensors.
...
Attributes
----------
name: str
name of collection
include_regex: list of (str representing regex for tensor names or block names)
list of regex expressions representing names of tensors (tf) or blocks(gluon)
to include for this collection
reduction_config: ReductionConfig object
reduction config to be applied for this collection.
if this is not passed, uses the default reduction_config
save_config: SaveConfig object
save config to be applied for this collection.
if this is not passed, uses the default save_config
"""

def __init__(
self,
Expand Down
3 changes: 3 additions & 0 deletions tornasole/core/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,9 @@ def _close_tb_writer(self):
self.tb_writers[self.mode].close()
del self.tb_writers[self.mode]

def close(self):
    """Public entry point for shutting the hook down.

    Delegates to `_cleanup()` so callers do not have to reach for the
    private method.
    """
    self._cleanup()

def _cleanup(self):
self._close_writers()
training_has_ended(self.out_dir)
Expand Down
6 changes: 6 additions & 0 deletions tornasole/core/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ def _step(self, step_num, mode=ModeKeys.GLOBAL, worker=None):
raise StepNotYetAvailable(step_num, mode)
assert False, "Should not happen"

def values(self, mode=ModeKeys.GLOBAL, worker=None):
    """Return a dict mapping each available step to this tensor's value at that step.

    `mode` and `worker` are forwarded unchanged to `value()` for every step.
    """
    # Enumerate the steps of the requested mode so the step numbers passed to
    # value() are interpreted in the same mode they came from.
    # NOTE(review): assumes steps() accepts `mode` the way value() does; confirm.
    return {
        step: self.value(step_num=step, mode=mode, worker=worker)
        for step in self.steps(mode=mode)
    }

def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None):
# step refreshes
s = self._step(step_num=step_num, mode=mode, worker=worker)
Expand Down
14 changes: 8 additions & 6 deletions tornasole/core/tfevent/index_file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,25 @@

class IndexWriter(object):
def __init__(self, file_path):
    """ Writer is initialized upon adding the first index. """
    self.file_path = file_path
    self.index_payload = []
    self.index_meta = {}
    # Left unopened on purpose: opening here and then resetting to None would
    # leak the handle. add_index() opens the writer on first use.
    self.writer = None

def __enter__(self):
    """Enter a `with` block; the writer object itself is the managed resource."""
    return self

def __exit__(self, exc_type=None, exc_value=None, traceback=None):
    """Close the writer when a `with` block ends.

    The three exception arguments are what the `with` statement passes in;
    they default to None so an existing zero-argument call keeps working.
    Nothing truthy is returned, so an exception raised inside the block
    still propagates.
    """
    self.close()

def _init_writer(self):
    """Open the underlying writer for `self.file_path` and store it on `self.writer`.

    `is_s3` decides the backend: an S3 path gets a `TSAccessS3` writer, any
    other path gets a `TSAccessFile` opened in "a+" mode. Exactly one access
    object is constructed per call. The writer is also returned, so code that
    assigns the result keeps working.
    """
    s3, bucket_name, key_name = is_s3(self.file_path)
    if s3:
        self.writer = TSAccessS3(bucket_name, key_name, binary=False)
    else:
        self.writer = TSAccessFile(self.file_path, "a+")
    return self.writer

def add_index(self, tensorlocation):
if not self.writer:
self._init_writer()
if not self.index_meta:
self.index_meta = {
"mode": tensorlocation.mode,
Expand Down Expand Up @@ -55,8 +57,8 @@ def close(self):
if self.writer is not None:
if self.index_meta and self.index_payload:
self.flush()
self.writer.close()
self.writer = None
self.writer.close()
self.writer = None


class Index:
Expand Down
5 changes: 2 additions & 3 deletions tornasole/mxnet/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ def __init__(
include_collections=None,
save_all=False,
):
collection_manager = get_collection_manager()
super().__init__(
collection_manager=get_collection_manager(),
collection_manager=collection_manager,
default_include_collections=DEFAULT_INCLUDE_COLLECTIONS,
data_type_name=mx.ndarray.NDArray.__name__,
out_dir=out_dir,
Expand Down Expand Up @@ -107,8 +108,6 @@ def _export_model(self):
f"Could not export model graph for tensorboard "
f"due to the mxnet exception: {e}"
)
else:
self.logger.warning("Tornasole does not know the model")

# This hook is invoked by trainer prior to running the forward pass.
def forward_pre_hook(self, block, inputs):
Expand Down
12 changes: 3 additions & 9 deletions tornasole/pytorch/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,7 @@
from tornasole.pytorch.singleton_utils import set_hook
from tornasole.pytorch.utils import get_reduction_of_data, make_numpy_array

# Collections handed to the base hook as its default include list. Only losses
# are listed; pass `include_collections=[...]` to the hook to request weights,
# biases, gradients or the default collection as well.
DEFAULT_INCLUDE_COLLECTIONS = [CollectionKeys.LOSSES]


class TornasoleHook(CallbackHook):
Expand All @@ -36,9 +30,9 @@ def __init__(
include_collections=None,
save_all=False,
):

collection_manager = get_collection_manager()
super().__init__(
collection_manager=get_collection_manager(),
collection_manager=collection_manager,
default_include_collections=DEFAULT_INCLUDE_COLLECTIONS,
data_type_name=torch.Tensor.__name__,
out_dir=out_dir,
Expand Down
3 changes: 2 additions & 1 deletion tornasole/tensorflow/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,9 @@ def __init__(
a shortcut for saving all tensors in the model.
they are all saved in the collection `all`
"""
collection_manager = get_collection_manager()
super().__init__(
collection_manager=get_collection_manager(),
collection_manager=collection_manager,
default_include_collections=DEFAULT_INCLUDE_COLLECTIONS,
out_dir=out_dir,
export_tensorboard=export_tensorboard,
Expand Down
3 changes: 2 additions & 1 deletion tornasole/tensorflow/keras.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ def __init__(
if save_all is not None:
msg = "'include_regex' is not yet supported and will be ignored."
self.logger.warning(msg)
collection_manager = get_collection_manager()
super().__init__(
collection_manager=get_collection_manager(),
collection_manager=collection_manager,
default_include_collections=DEFAULT_INCLUDE_COLLECTIONS,
out_dir=out_dir,
dry_run=dry_run,
Expand Down
Loading

0 comments on commit 1134f64

Please sign in to comment.