Write support for tensorboard (aws#252)

* Save histograms for weights and gradients * Use standard TF summary function * undo line break changes * fix cases when bool tensor was being passed to add_histogram, and fix tests * Fix region bug and update tb_writer construction * Include summaries if any write_histogram was set to True * Refactor writers in core * set default step to 0 * Use new writer in hook * Cherry picking change of refactor writers * set default step to 0 * remove histogram related stuff * rename IndexUtil * Fix imports * remove import of re * Fix import of summary proto * Fix step usage in writers * Fix step usage by event file writer * Remove directory in tensorboard directory, and add collection name as prefix for summaries created * Fix import errors * Fix resnet example which did not have str2bool args * Fix core test * Fix core test * Indentation and move some code to a new function * Merged Vikas' branch on tb data read * Add untested support to read tensorboard data * Write mode and mode_step for summaries, and fix the error of multiple global steps being assigned to same train step * remove unnecessary file * remove test script * Remove changes to imagenet script * working scalars * Change path of tornasole event files * Have new index file per mode for tensorboard events * Move tensor values to different file * move to outside tensors folder * Change frequencies for tf examples * Introduce CollectionKeys * Merging export as json * Make histogram a reduction config property, and add save_raw_tensor field to reduction config. Verified the usage for tensorflow. Also some cleanup with respect to save config in save manager * Fix bug in loading collections * Fix writing tensorboard data in global mode * Add graph support to pytorch models. Copied some new protos, and a couple of files from torch.tensorboard. 
* Working graph export for mxnet * Save graph correctly for mxnet * undo utils change worker pid * fix import * fix import * do not flush index writer * remove data files * Fix save config issue * make save_histogram a property of collection * Fix save config bugs, and add scalar support to TF * Skip summaries whose tensors are unreachable in graph, and avoid adding histogram when original collection is not included * Move histogram creation to writer instead of event_file_writer, refactor should_save_collection in save manager, add save_scalar methods to MXNet and Pytorch * WIP tensor scalar support * undo add of data * remove test * use correct writer * Make saving scalars work, and added type checks * Writing scalars and tensors supported. tested in tensorboard. need to test through trials * WIP testing steps * remove save scalar and tensor for now because of step number issues. work on trial loading tensorboard data and come back to this * Working reads in non index mode * Tensorboard reads working with indexing * cleanup index file location function * Make pytorch tests working * Reduce length of test_estimator_modes, and add tf tensorboard test * Add basic scalar summary test * Untested completed reads of tensorboard data * Add more tensorboard tests for trial * fix test when reading event files for tensorboard from s3 * Fixed a reduction test * Fix reduction test in TF * Fix merge of a test * fix logger import, and default save/reduction config in save manager * Fix reduction save_raw_tensor in TF * Some cleanup of prepare and collection includes * fix tf tests * Fix all tests * Add tensorboard index test * Fix tensorboard test wrt optimizer_variables * not save histogram for strings * remove when nan support * add hash * Fix collection checks in xgboost * add xgboost tests * Typo * Update hook.py (aws#243) * reduce length of test and add / to prefix * WIP move to tornasole hist summaries for TF * Change collections_to_save_for_step, make TF use custom 
histograms, refactor to _save_tensor method for all frameworks * rename to save_for_tensor * undo some files * undo some files * Update tests.sh * remove pytorch graph support * remove mxnet graph support * cleanup * remove tf tensorboard duplicated test * Fix bug of tb writer not being closed after exporting graph * WIP fixing tests * Remove read changes * fix value types remaining in code * fix tests * catch exception when nan * use make_numpy_array for xgboost * Fix xgboost error where collections_in_set was empty but not none * change log * remove summary collections * tweak dry run behavior * Fix dry run flag * undo move of steps to own file * Delete steps.py * fix import * fix import in test * cleanup * remove index for tensorboard data * Address review comments * Update hook.py
atqy · Oct 11, 2019 · ce31b26 · ce31b26
1 parent e7e9b46
commit ce31b26
Show file tree

Hide file tree

Showing 35 changed files with 1,032 additions and 542 deletions.
diff --git a/tests/analysis/exceptions/test_exceptions.py b/tests/analysis/exceptions/test_exceptions.py
@@ -1,9 +1,11 @@
+import shutil
 import pytest
 import uuid
 from tests.analysis.utils import generate_data
 from tornasole.trials import create_trial
 from tornasole.exceptions import *
 import boto3 as boto3
+import os
 
 def del_s3(bucket, file_path):
   s3_client = boto3.client('s3')
@@ -12,8 +14,7 @@ def del_s3(bucket, file_path):
 @pytest.mark.slow # 0:40 to run
 def test_refresh_tensors():
     trial_name = str(uuid.uuid4())
-    path = 's3://tornasolecodebuildtest/rules/tensors/ts_output/train/'
-    bucket = 'tornasolecodebuildtest'
+    path = '/tmp/tornasole_analysis_tests/test_refresh_tensors/'
     num_steps = 8
     num_tensors = 10
     for i in range(num_steps):
@@ -29,7 +30,6 @@ def test_refresh_tensors():
       assert False
     except TensorUnavailable:
       pass
-    del_s3(bucket, file_path=path)
 
     assert tr.tensor('foo_1') is not None
     assert tr.tensor('foo_1').value(num_steps - 1) is not None
@@ -63,3 +63,5 @@ def test_refresh_tensors():
       assert False
     except StepNotYetAvailable:
       pass
+
+    shutil.rmtree(os.path.join(path, trial_name))
diff --git a/tests/core/test_index.py b/tests/core/test_index.py
@@ -1,6 +1,6 @@
 from tornasole.core.writer import FileWriter
 from tornasole.core.reader import FileReader
-from tornasole.core.locations import EventFileLocation, IndexFileLocationUtils
+from tornasole.core.locations import TensorFileLocation, IndexFileLocationUtils
 import shutil
 import os
 import numpy as np
@@ -21,8 +21,8 @@ def test_index():
         writer.write_tensor(tdata=numpy_tensor[i], tname=n)
     writer.flush()
     writer.close()
-    efl = EventFileLocation(step_num=step, worker_name=worker)
-    eventfile = efl.get_location(trial_dir=run_dir)
+    efl = TensorFileLocation(step_num=step, worker_name=worker)
+    eventfile = efl.get_file_location(trial_dir=run_dir)
     indexfile = IndexFileLocationUtils.get_index_key_for_step(run_dir, step, worker)
 
     fo = open(eventfile, "rb")

diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py
@@ -9,7 +9,6 @@
 Integration tests with S3 take 95% of the time.
 """
 
-
 import pytest
 import tensorflow.compat.v1 as tf
 import numpy as np
@@ -23,9 +22,11 @@
 from tornasole.tensorflow import reset_collections
 from tornasole.tensorflow.hook import TornasoleHook
 from tornasole.trials import create_trial
+from tornasole.core.utils import is_s3
 from tests.analysis.utils import delete_s3_prefix
 
-def help_test_mnist(path, save_config=None, hook=None, set_modes=True):
+def help_test_mnist(path, save_config=None, hook=None, set_modes=True,
+                    num_train_steps=20, num_eval_steps=10):
     trial_dir = path
     tf.reset_default_graph()
     if hook is None:
@@ -135,102 +136,99 @@ def train(num_steps):
     if set_modes:
         hook.set_mode(ts.modes.TRAIN)
     # train one step and display the probabilties
-    train(2)
+    train(num_train_steps/2)
 
     if set_modes:
         hook.set_mode(ts.modes.EVAL)
     mnist_classifier.evaluate(input_fn=eval_input_fn,
-                              steps=3,
+                              steps=num_eval_steps,
                               hooks=[hook])
 
     if set_modes:
         hook.set_mode(ts.modes.TRAIN)
-    train(2)
+    train(num_train_steps/2)
 
     return train
 
-@pytest.mark.slow # 0:02 to run
-def test_mnist_local():
-    run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
-    trial_dir = os.path.join(TORNASOLE_TF_HOOK_TESTS_DIR, run_id)
-    help_test_mnist(trial_dir, ts.SaveConfig(save_interval=2))
+def helper_test_mnist_trial(trial_dir):
     tr = create_trial(trial_dir)
-    assert len(tr.available_steps()) == 4
+    assert len(tr.available_steps()) == 3
     assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 2
-    assert len(tr.available_steps(mode=ts.modes.EVAL)) == 2
+    assert len(tr.available_steps(mode=ts.modes.EVAL)) == 1
     assert len(tr.tensors()) == 17
-    shutil.rmtree(trial_dir)
+    on_s3, bucket, prefix = is_s3(trial_dir)
+    if not on_s3:
+        shutil.rmtree(trial_dir, ignore_errors=True)
+    else:
+        delete_s3_prefix(bucket, prefix)
+
+@pytest.mark.slow # 0:02 to run
+def test_mnist(on_s3=False):
+    run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
+    if on_s3:
+        bucket = 'tornasole-testing'
+        prefix = 'tornasole_tf/hooks/estimator_modes/' + run_id
+        trial_dir = f's3://{bucket}/{prefix}'
+    else:
+        trial_dir = os.path.join(TORNASOLE_TF_HOOK_TESTS_DIR, run_id)
+    help_test_mnist(trial_dir, save_config=ts.SaveConfig(save_interval=2),
+                    num_train_steps=4, num_eval_steps=2)
+    helper_test_mnist_trial(trial_dir)
 
 @pytest.mark.slow # 0:02 to run
 def test_mnist_local_json():
     out_dir = 'newlogsRunTest1/test_mnist_local_json_config'
     shutil.rmtree(out_dir, ignore_errors=True)
     os.environ[TORNASOLE_CONFIG_FILE_PATH_ENV_STR] = 'tests/tensorflow/hooks/test_json_configs/test_mnist_local.json'
     hook = TornasoleHook.hook_from_config()
-    help_test_mnist(path=out_dir, hook=hook)
-    tr = create_trial(out_dir)
-    assert len(tr.available_steps()) == 4
-    assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 2
-    assert len(tr.available_steps(mode=ts.modes.EVAL)) == 2
-    assert len(tr.tensors()) == 17
-    shutil.rmtree(out_dir, ignore_errors=True)
+    help_test_mnist(path=out_dir, hook=hook,
+                    num_train_steps=4, num_eval_steps=2)
+    helper_test_mnist_trial(out_dir)
 
 @pytest.mark.slow # 1:04 to run
 def test_mnist_s3():
-    run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
-    bucket = 'tornasole-testing'
-    prefix = 'tornasole_tf/hooks/estimator_modes/' + run_id
-    trial_dir = f's3://{bucket}/{prefix}'
-    help_test_mnist(trial_dir, ts.SaveConfig(save_interval=2))
+    # Takes 1:04 to run, compared to 4 seconds above.
+    # Speed improvements, or should we migrate integration tests to their own folder?
+    test_mnist(True)
+
+def helper_test_multi_save_configs_trial(trial_dir):
     tr = create_trial(trial_dir)
-    assert len(tr.available_steps()) == 4
-    assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 2
+    assert len(tr.available_steps()) == 5, tr.available_steps()
+    assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 3
     assert len(tr.available_steps(mode=ts.modes.EVAL)) == 2
     assert len(tr.tensors()) == 17
-    delete_s3_prefix(bucket, prefix)
+    on_s3, bucket, prefix = is_s3(trial_dir)
+    if not on_s3:
+        shutil.rmtree(trial_dir)
+    else:
+        delete_s3_prefix(bucket, prefix)
 
 @pytest.mark.slow # 0:04 to run
-def test_mnist_local_multi_save_configs():
+def test_mnist_local_multi_save_configs(on_s3=False):
+    # Runs in 0:04
     run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
-    trial_dir = os.path.join(TORNASOLE_TF_HOOK_TESTS_DIR, run_id)
+    if on_s3:
+        bucket = 'tornasole-testing'
+        prefix = 'tornasole_tf/hooks/estimator_modes/' + run_id
+        trial_dir = f's3://{bucket}/{prefix}'
+    else:
+        trial_dir = os.path.join(TORNASOLE_TF_HOOK_TESTS_DIR, run_id)
     help_test_mnist(trial_dir, ts.SaveConfig({
         ts.modes.TRAIN: ts.SaveConfigMode(save_interval=2),
         ts.modes.EVAL: ts.SaveConfigMode(save_interval=3)
-    }))
-    tr = create_trial(trial_dir)
-    assert len(tr.available_steps()) == 3
-    assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 2
-    assert len(tr.available_steps(mode=ts.modes.EVAL)) == 1
-    assert len(tr.tensors()) == 17
-    shutil.rmtree(trial_dir)
+    }), num_train_steps=6, num_eval_steps=4)
+    helper_test_multi_save_configs_trial(trial_dir)
 
 @pytest.mark.slow # 0:52 to run
 def test_mnist_s3_multi_save_configs():
-    run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
-    bucket = 'tornasole-testing'
-    prefix = 'tornasole_tf/hooks/estimator_modes/' + run_id
-    trial_dir = f's3://{bucket}/{prefix}'
-    help_test_mnist(trial_dir, ts.SaveConfig({
-        ts.modes.TRAIN: ts.SaveConfigMode(save_interval=2),
-        ts.modes.EVAL: ts.SaveConfigMode(save_interval=3)
-    }))
-    tr = create_trial(trial_dir)
-    assert len(tr.available_steps()) == 3
-    assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 2
-    assert len(tr.available_steps(mode=ts.modes.EVAL)) == 1
-    assert len(tr.tensors()) == 17
-    delete_s3_prefix(bucket, prefix)
+    # Takes 0:52 to run, compared to 4 seconds above. Speed improvements?
+    test_mnist_local_multi_save_configs(True)
 
 @pytest.mark.slow # 0:02 to run
 def test_mnist_local_multi_save_configs_json():
     out_dir = 'newlogsRunTest1/test_save_config_modes_hook_config'
     shutil.rmtree(out_dir, ignore_errors=True)
     os.environ[TORNASOLE_CONFIG_FILE_PATH_ENV_STR] = 'tests/tensorflow/hooks/test_json_configs/test_save_config_modes_hook_config.json'
     hook = ts.TornasoleHook.hook_from_config()
-    help_test_mnist(out_dir, hook=hook)
-    tr = create_trial(out_dir)
-    assert len(tr.available_steps()) == 3
-    assert len(tr.available_steps(mode=ts.modes.TRAIN)) == 2
-    assert len(tr.available_steps(mode=ts.modes.EVAL)) == 1
-    assert len(tr.tensors()) == 17
-    shutil.rmtree(out_dir)
+    help_test_mnist(out_dir, hook=hook, num_train_steps=6, num_eval_steps=4)
+    helper_test_multi_save_configs_trial(out_dir)
diff --git a/tests/tensorflow/hooks/test_losses.py b/tests/tensorflow/hooks/test_losses.py
@@ -10,9 +10,11 @@
 def test_mnist_local():
   run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
   trial_dir = os.path.join(TORNASOLE_TF_HOOK_TESTS_DIR, run_id)
-  help_test_mnist(trial_dir, ts.SaveConfig(save_interval=2))
+  help_test_mnist(trial_dir, ts.SaveConfig(save_interval=2),
+                  num_train_steps=4,
+                  num_eval_steps=2)
   tr = create_trial(trial_dir)
   assert len(tr.collection('losses').get_tensor_names()) == 1
   for t in tr.collection('losses').get_tensor_names():
-    assert len(tr.tensor(t).steps()) == 4
+    assert len(tr.tensor(t).steps()) == 3
   shutil.rmtree(trial_dir)
diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py
@@ -1,47 +1,53 @@
 import os
 import shutil
 from datetime import datetime
+
 from tornasole.core.reduction_config import ALLOWED_REDUCTIONS, ALLOWED_NORMS
 from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
 from tornasole.exceptions import *
 import tornasole.tensorflow as ts
 from .utils import *
 
 
-def helper_test_reductions(trial_dir, hook):
+def helper_test_reductions(trial_dir, hook, save_raw_tensor):
     simple_model(hook)
     _, files = get_dirs_files(trial_dir)
-    coll = ts.get_collections()
     from tornasole.trials import create_trial
 
     tr = create_trial(trial_dir)
-    assert len(tr.tensors()) == 3
+    assert len(tr.tensors()) == 3, tr.tensors()
     for tname in tr.tensors():
         t = tr.tensor(tname)
         try:
-            t.value(0)
-            assert False
-        except TensorUnavailableForStep:
+            print(t.value(0))
+            if save_raw_tensor is False:
+                assert False, (tname, t.value(0))
+        except TensorUnavailableForStep as e:
+            if save_raw_tensor is True:
+                assert False, (t.name, e)
             pass
         assert len(t.reduction_values(0)) == 18
         for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS:
             for b in [False, True]:
                 assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None
 
 
-def test_reductions():
+def test_reductions(save_raw_tensor=False):
     run_id = 'trial_' + datetime.now().strftime('%Y%m%d-%H%M%S%f')
     trial_dir = os.path.join('/tmp/tornasole_rules_tests/', run_id)
     pre_test_clean_up()
     rdnc = ReductionConfig(reductions=ALLOWED_REDUCTIONS,
                            abs_reductions=ALLOWED_REDUCTIONS,
                            norms=ALLOWED_NORMS,
-                           abs_norms=ALLOWED_NORMS)
+                           abs_norms=ALLOWED_NORMS,
+                           save_raw_tensor=save_raw_tensor)
     hook = TornasoleHook(out_dir=trial_dir,
                          save_config=SaveConfig(save_interval=1),
                          reduction_config=rdnc)
-    helper_test_reductions(trial_dir, hook)
+    helper_test_reductions(trial_dir, hook, save_raw_tensor)
 
+def test_reductions_with_raw_tensor():
+    test_reductions(save_raw_tensor=True)
 
 def test_reductions_json():
     trial_dir = "newlogsRunTest1/test_reductions"
@@ -50,4 +56,4 @@ def test_reductions_json():
         TORNASOLE_CONFIG_FILE_PATH_ENV_STR] = "tests/tensorflow/hooks/test_json_configs/test_reductions.json"
     pre_test_clean_up()
     hook = ts.TornasoleHook.hook_from_config()
-    helper_test_reductions(trial_dir, hook)
+    helper_test_reductions(trial_dir, hook, False)
diff --git a/tests/tensorflow/hooks/test_save_all_full.py b/tests/tensorflow/hooks/test_save_all_full.py
@@ -24,15 +24,15 @@ def test_save_all_full(hook=None, trial_dir=None):
     dirs, _ = get_dirs_files(os.path.join(trial_dir, 'events'))
 
     coll = get_collections()
-    assert len(coll) == 6
+    assert all([x in coll.keys() for x in ['all','weights','gradients','losses','optimizer_variables']])
     assert len(coll['weights'].tensor_names) == 1
     assert len(coll['gradients'].tensor_names) == 1
     assert len(coll['losses'].tensor_names) == 1
 
     assert TORNASOLE_DEFAULT_COLLECTIONS_FILE_NAME in files
     cm = CollectionManager.load(join(trial_dir, TORNASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
 
-    assert len(cm.collections) == 6
+    assert len(cm.collections) == len(coll), (coll, cm.collections)
     assert len(cm.collections['weights'].tensor_names) == 1
     assert len(cm.collections['losses'].tensor_names) == 1
     assert len(cm.collections['gradients'].tensor_names) == 1
@@ -43,7 +43,6 @@ def test_save_all_full(hook=None, trial_dir=None):
                                     len(cm.collections['gradients'].tensor_names)
     num_tensors_collection = len(coll['weights'].tensor_names) + \
                              len(coll['gradients'].tensor_names)
-
     assert num_tensors_collection == num_tensors_loaded_collection
     assert len(dirs) == 5
     for step in dirs:
@@ -55,9 +54,10 @@ def test_save_all_full(hook=None, trial_dir=None):
             for x in fr.read_tensors():
                 tensor_name, step, tensor_data, mode, mode_step = x
                 i += 1
+                print(tensor_name)
                 size += tensor_data.nbytes
-        assert i == 85
-        assert size == 1470
+        assert i == 84
+        assert size == 1462
     if hook_created:
         shutil.rmtree(trial_dir)
 

diff --git a/tests/tensorflow/hooks/test_save_config.py b/tests/tensorflow/hooks/test_save_config.py
@@ -90,7 +90,7 @@ def test_save_config_start_and_end_json():
 
 
 def helper_save_config_modes(trial_dir, hook):
-    help_test_mnist(trial_dir, hook=hook)
+    help_test_mnist(trial_dir, hook=hook, num_train_steps=4, num_eval_steps=3)
     tr = create_trial(trial_dir)
     for tname in tr.tensors_in_collection('weights'):
         t = tr.tensor(tname)

diff --git a/tests/tensorflow/hooks/test_save_reductions.py b/tests/tensorflow/hooks/test_save_reductions.py
@@ -10,13 +10,12 @@ def helper_save_reductions(trial_dir, hook):
   simple_model(hook)
   _, files = get_dirs_files(trial_dir)
   coll = get_collections()
-  assert len(coll) == 5
   assert len(coll['weights'].tensor_names) == 1
   assert len(coll['gradients'].tensor_names) == 1
 
   assert TORNASOLE_DEFAULT_COLLECTIONS_FILE_NAME in files
   cm = CollectionManager.load(join(trial_dir, TORNASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
-  assert len(cm.collections) == 5
+  assert len(cm.collections) == len(coll)
   assert len(cm.collections['weights'].tensor_names) == 1
   assert len(cm.collections['gradients'].tensor_names) == 1
   # as we hadn't asked to be saved
@@ -60,8 +59,6 @@ def test_save_reductions():
   helper_save_reductions(trial_dir, hook)
 
 
-
-
 def test_save_reductions_json():
   trial_dir = "newlogsRunTest1/test_save_reductions"
   shutil.rmtree(trial_dir, ignore_errors=True)

diff --git a/tests/tensorflow/hooks/test_training_end.py b/tests/tensorflow/hooks/test_training_end.py
@@ -18,6 +18,6 @@ def test_training_job_has_ended():
           [sys.executable, "examples/tensorflow/scripts/simple.py",
            "--tornasole_path", trial_dir,
            '--steps', '10', '--tornasole_frequency', '5'],
-          env={'CUDA_VISIBLE_DEVICES':'-1'})
+          env={'CUDA_VISIBLE_DEVICES':'-1', 'TORNASOLE_LOG_LEVEL': 'debug'})
   assert has_training_ended(trial_dir) == True
   shutil.rmtree(trial_dir)