-
Notifications
You must be signed in to change notification settings - Fork 0
[RDST-111] keep tensorboard directory clean during s3 sync #3
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,7 @@ def __init__(self, estimator, logdir=None): | |
threading.Thread.__init__(self) | ||
self.event = threading.Event() | ||
self.estimator = estimator | ||
self.aws_sync_dir = tempfile.mkdtemp() | ||
self.logdir = logdir or tempfile.mkdtemp() | ||
|
||
@staticmethod | ||
|
@@ -47,6 +48,29 @@ def _cmd_exists(cmd): | |
for path in os.environ["PATH"].split(os.pathsep) | ||
) | ||
|
||
@staticmethod | ||
def _sync_directories(from_directory, to_directory): | ||
"""Sync to_directory with from_directory by copying each file in | ||
to_directory with new contents. Why do this? Because TensorBoard picks | ||
up temp files from `aws s3 sync` and then stops reading the correct | ||
tfevent files. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a relevant TensorBoard issue we could include here? I'm just curious if that could be an easier reference for the future in understanding this and knowing if/when it's fixed, versus using This is the closest one I came across, though it's slightly different: tensorflow/tensorboard#70 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I still think it's probably related to tensorflow/tensorboard#349. I'll put that in the docstring. |
||
|
||
Args: | ||
from_directory (str): The directory with updated files. | ||
to_directory (str): The directory to be synced. | ||
""" | ||
for root, dirs, files in os.walk(from_directory): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you'll want to check to see if |
||
to_root = root.replace(from_directory, to_directory) | ||
for directory in dirs: | ||
to_child_dir = os.path.join(to_root, directory) | ||
if not os.path.exists(to_child_dir): | ||
os.mkdir(to_child_dir) | ||
for fname in files: | ||
from_file = os.path.join(root, fname) | ||
to_file = os.path.join(to_root, fname) | ||
with open(from_file, 'rb') as a, open(to_file, 'wb') as b: | ||
b.write(a.read()) | ||
|
||
def validate_requirements(self): | ||
"""Ensure that TensorBoard and the AWS CLI are installed. | ||
|
||
|
@@ -98,8 +122,9 @@ def run(self): | |
while not self.estimator.checkpoint_path: | ||
self.event.wait(1) | ||
while not self.event.is_set(): | ||
args = ['aws', 's3', 'sync', self.estimator.checkpoint_path, self.logdir] | ||
args = ['aws', 's3', 'sync', self.estimator.checkpoint_path, self.aws_sync_dir] | ||
subprocess.call(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
self._sync_directories(self.aws_sync_dir, self.logdir) | ||
self.event.wait(10) | ||
tensorboard_process.terminate() | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My gut says this should be
self._aws_sync_dir
, but it's not clear to me why some fields are considered public while others aren't, so it probably doesn't matter.