Skip to content

Commit

Permalink
Remove graph export support in pytorch, and fix subtle bugs in tensor…
Browse files Browse the repository at this point in the history
…board dir assignment (aws#370)

* Remove graph export support in pytorch, and fix expansion of the user-provided tensorboard directory

* Removed print

* check none

* fix test

Signed-off-by: Rahul Huilgol <[email protected]>

* fix json load of tensorboard configs

* Address comments

Signed-off-by: Rahul Huilgol <[email protected]>
  • Loading branch information
rahul003 authored Nov 11, 2019
1 parent fc2ea3e commit 3377070
Show file tree
Hide file tree
Showing 23 changed files with 44 additions and 1,690 deletions.
8 changes: 6 additions & 2 deletions examples/pytorch/scripts/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ def create_tornasole_hook(output_dir, module=None, hook_type="saveall", save_ste
# Create a hook that logs weights, biases, gradients and inputs/outputs of model
if hook_type == "saveall":
hook = TornasoleHook(
out_dir=output_dir, save_config=SaveConfig(save_steps=save_steps), save_all=True
out_dir=output_dir,
save_config=SaveConfig(save_steps=save_steps),
save_all=True,
export_tensorboard=True,
)
elif hook_type == "module-input-output":
# The names of input and output tensors of a module are in following format
Expand All @@ -57,11 +60,12 @@ def create_tornasole_hook(output_dir, module=None, hook_type="saveall", save_ste
out_dir=output_dir,
save_config=SaveConfig(save_steps=save_steps),
include_collections=["weights", "gradients", "biases", "l_mod"],
export_tensorboard=True,
)
elif hook_type == "weights-bias-gradients":
save_config = SaveConfig(save_steps=save_steps)
# Create a hook that logs ONLY weights, biases, and gradients
hook = TornasoleHook(out_dir=output_dir, save_config=save_config)
hook = TornasoleHook(out_dir=output_dir, save_config=save_config, export_tensorboard=True)
return hook


Expand Down
1 change: 1 addition & 0 deletions examples/tensorflow/scripts/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def str2bool(v):
include_collections=["weights", "gradients", "losses"],
save_config=ts.SaveConfig(save_interval=args.tornasole_frequency),
reduction_config=rdnc,
tensorboard_dir=args.tornasole_path + "/tb/",
)

hook.set_mode(ts.modes.TRAIN)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def compile_summary_protobuf():
proto_paths = ["tornasole/core/tfevent/proto", "tornasole/pytorch/proto"]
proto_paths = ["tornasole/core/tfevent/proto"]
cmd = "set -ex && protoc "
for proto_path in proto_paths:
proto_files = os.path.join(proto_path, "*.proto")
Expand Down
2 changes: 1 addition & 1 deletion tests/core/test_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_tensorboard_dir_script_export_tensorboard():
""" In script mode, passing `export_tensorboard=True` results in tensorboard_dir=out_dir. """
with ScriptSimulator() as sim:
hook = ts.TornasoleHook(out_dir=sim.out_dir, export_tensorboard=True)
assert hook.tensorboard_dir == hook.out_dir
assert hook.tensorboard_dir == os.path.join(hook.out_dir, "tensorboard")


def test_tensorboard_dir_script_specify_tensorboard_dir():
Expand Down
1 change: 0 additions & 1 deletion tornasole/core/access_layer/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def __init__(self, path, mode):
self.path = path
self.mode = mode
ensure_dir(path)

if mode in WRITE_MODES:
self.temp_path = get_temp_path(self.path)
ensure_dir(self.temp_path)
Expand Down
21 changes: 10 additions & 11 deletions tornasole/core/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from tornasole.core.collection_manager import CollectionManager
from tornasole.core.save_config import SaveConfig, SaveConfigMode
from tornasole.core.access_layer import training_has_ended
from tornasole.core.hook_utils import verify_and_get_out_dir
from tornasole.core.hook_utils import verify_and_get_out_dir, get_tensorboard_dir
from tornasole.core.sagemaker_utils import is_sagemaker_job
from tornasole.core.modes import ModeKeys, ALLOWED_MODES
from tornasole.core.utils import flatten, get_tb_worker
Expand Down Expand Up @@ -92,16 +92,11 @@ def __init__(
they are all saved in the collection `all`
"""
self.out_dir = verify_and_get_out_dir(out_dir)

if export_tensorboard and tensorboard_dir:
self.tensorboard_dir = tensorboard_dir
elif not export_tensorboard and tensorboard_dir:
# Assume the user forgot `export_tensorboard` and save anyway.
self.tensorboard_dir = tensorboard_dir
elif export_tensorboard and not tensorboard_dir:
self.tensorboard_dir = out_dir
else:
self.tensorboard_dir = None
self.tensorboard_dir = get_tensorboard_dir(
export_tensorboard=export_tensorboard,
tensorboard_dir=tensorboard_dir,
out_dir=self.out_dir,
)

self.dry_run = dry_run
self.worker = None
Expand Down Expand Up @@ -428,6 +423,9 @@ def _write_scalar_summary(self, tensor_name, tensor_value, save_colls):
return

if np_val.squeeze().ndim == 0:
self.logger.debug(
f"Saving scalar summary {tensor_name} for global step {self.step}"
)
tb_writer.write_scalar_summary(tensor_name, np_val, self.step)
else:
self.logger.debug(
Expand All @@ -449,6 +447,7 @@ def _write_histogram_summary(self, tensor_name, tensor_value, save_collections):
return

hist_name = f"histograms/{s_col.name}/{tensor_name}"
self.logger.debug(f"Saving {hist_name} for global step {self.step}")
tb_writer.write_histogram_summary(
tdata=np_value, tname=hist_name, global_step=self.step
)
Expand Down
15 changes: 15 additions & 0 deletions tornasole/core/hook_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,18 @@ def verify_and_get_out_dir(out_dir):
check_dir_exists(out_dir)

return out_dir


def get_tensorboard_dir(export_tensorboard, tensorboard_dir, out_dir):
    """Resolve the directory that TensorBoard artifacts should be written to.

    Args:
        export_tensorboard: Truthy when TensorBoard export was requested.
        tensorboard_dir: Directory supplied by the user, or ``None``. A leading
            ``~`` is expanded via ``os.path.expanduser``.
        out_dir: Output directory used to derive the default location.

    Returns:
        The expanded ``tensorboard_dir`` when a non-empty one was supplied
        (even if ``export_tensorboard`` is falsy), ``<out_dir>/tensorboard``
        when only ``export_tensorboard`` is set, and ``None`` otherwise.
    """
    user_dir = None if tensorboard_dir is None else os.path.expanduser(tensorboard_dir)

    # An explicitly supplied directory wins regardless of the flag: if the flag
    # is off we assume the user forgot `export_tensorboard` and save anyway.
    if user_dir:
        return user_dir

    # No directory given; fall back to a subfolder of out_dir only on request.
    if export_tensorboard:
        return os.path.join(out_dir, "tensorboard")

    return None
4 changes: 2 additions & 2 deletions tornasole/core/json_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ def create_hook_from_json_config(
export_tensorboard = bool(tensorboard_dir is not None)
# Otherwise, place TB artifacts in out_dir
else:
tensorboard_dir = tornasole_params[EXPORT_TENSORBOARD_KEY]
export_tensorboard = tornasole_params[TENSORBOARD_DIR_KEY]
tensorboard_dir = tornasole_params[TENSORBOARD_DIR_KEY]
export_tensorboard = tornasole_params[EXPORT_TENSORBOARD_KEY]

return hook_cls(
out_dir=out_dir,
Expand Down
14 changes: 5 additions & 9 deletions tornasole/core/locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,8 @@ def load_filename(cls, s, print_error=True):
logger.error("Failed to load efl: ", s)
return None

@staticmethod
@abstractmethod
def get_dir(trial_dir):
def get_file_location(self):
pass


Expand Down Expand Up @@ -94,13 +93,10 @@ def __init__(self, step_num, worker_name, mode=None):
self.mode = mode
self.type = "tensorboard"

@staticmethod
def get_dir(trial_dir):
return os.path.join(trial_dir, "tensorboard")

def get_file_location(self, trial_dir=""):
if trial_dir:
event_key_prefix = os.path.join(self.get_dir(trial_dir), self.mode.name)
def get_file_location(self, base_dir=""):
# when base_dir is empty it just returns the relative file path
if base_dir:
event_key_prefix = os.path.join(base_dir, self.mode.name)
else:
event_key_prefix = os.path.join(self.type, self.mode.name)

Expand Down
2 changes: 1 addition & 1 deletion tornasole/core/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def __init__(
el = TensorboardFileLocation(
step_num=self.step, worker_name=self.worker, mode=self.mode
)
event_file_path = el.get_file_location(trial_dir=self.trial_dir)
event_file_path = el.get_file_location(base_dir=self.trial_dir)
self.index_writer = None
else:
assert False, "Writer type not supported: {}".format(wtype)
Expand Down
51 changes: 0 additions & 51 deletions tornasole/pytorch/_proto_graph.py

This file was deleted.

Loading

0 comments on commit 3377070

Please sign in to comment.