Skip to content

Commit

Permalink
Rename env variable (aws#91)
Browse files Browse the repository at this point in the history
* rename env variable
  • Loading branch information
NihalHarish authored and Vikas-kum committed Jan 10, 2020
1 parent 5246cda commit 47265df
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 12 deletions.
14 changes: 12 additions & 2 deletions docs/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,28 @@ Contains the path to the directory where metrics will be recorded for consumptio
This is relevant only in SageMaker environment, where this variable points to a pre-defined location.


#### `TRAINING_END_DELAY_REFRESH`:
**Note**: The environment variables below are applicable for versions > 0.4.14

#### `SMDEBUG_TRAINING_END_DELAY_REFRESH`:

During analysis, a [trial](analysis.md) is created to query for tensors from a specified directory. This
directory contains collections, events, and index files. This environment variable
specifies how many seconds to wait before refreshing the index files to check if training has ended
and the tensor is available. By default, this value is set to 1.


#### `INCOMPLETE_STEP_WAIT_WINDOW`:
#### `SMDEBUG_INCOMPLETE_STEP_WAIT_WINDOW`:

During analysis, a [trial](analysis.md) is created to query for tensors from a specified directory. This
directory contains collections, events, and index files. A trial checks to see if a step
specified in the smdebug hook has been completed. This environment variable
specifies the maximum number of incomplete steps that the trial will wait for before marking
half of them as complete. Default: 1000


#### `SMDEBUG_MISSING_EVENT_FILE_RETRY_LIMIT`:

During analysis, a [trial](analysis.md) is created to query for tensors from a specified directory. This
directory contains collections, events, and index files. All the tensor data is stored in the event files.
When tensor data is requested from an event file that is not yet available, this variable specifies
the number of times the request is retried. Default: 100
8 changes: 5 additions & 3 deletions smdebug/core/config_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@
LATEST_MODE_STEP = "latest-mode-step"
TRAINING_RUN = "training-run"

INCOMPLETE_STEP_WAIT_WINDOW_KEY = "INCOMPLETE_STEP_WAIT_WINDOW"
INCOMPLETE_STEP_WAIT_WINDOW_KEY = "SMDEBUG_INCOMPLETE_STEP_WAIT_WINDOW"
INCOMPLETE_STEP_WAIT_WINDOW_DEFAULT = 1000
DEFAULT_EVENT_FILE_RETRY_LIMIT = 100

TRAINING_END_DELAY_REFRESH_KEY = "TRAINING_END_DELAY_REFRESH"
MISSING_EVENT_FILE_RETRY_LIMIT_KEY = "SMDEBUG_MISSING_EVENT_FILE_RETRY_LIMIT"
MISSING_EVENT_FILE_RETRY_LIMIT = 100

TRAINING_END_DELAY_REFRESH_KEY = "SMDEBUG_TRAINING_END_DELAY_REFRESH"
TRAINING_END_DELAY_REFRESH_DEFAULT = 1

CALLABLE_CACHE_ENV_VAR = "SMDEBUG_KERAS_CALLABLE_CACHE_TYPE"
Expand Down
7 changes: 5 additions & 2 deletions smdebug/core/index_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
# First Party
from smdebug.core.access_layer.s3handler import ReadObjectRequest, S3Handler
from smdebug.core.access_layer.utils import has_training_ended
from smdebug.core.config_constants import DEFAULT_EVENT_FILE_RETRY_LIMIT
from smdebug.core.config_constants import (
MISSING_EVENT_FILE_RETRY_LIMIT,
MISSING_EVENT_FILE_RETRY_LIMIT_KEY,
)
from smdebug.core.locations import IndexFileLocationUtils, TensorLocation
from smdebug.core.logger import get_logger
from smdebug.core.modes import ModeKeys
Expand Down Expand Up @@ -100,7 +103,7 @@ class IndexReader(ABC):

def __init__(self, path):
self.event_file_retry_limit = os.getenv(
"TORNASOLE_EVENT_FILE_RETRY_LIMIT", DEFAULT_EVENT_FILE_RETRY_LIMIT
MISSING_EVENT_FILE_RETRY_LIMIT_KEY, MISSING_EVENT_FILE_RETRY_LIMIT
)
self.path = path
self.logger = get_logger()
Expand Down
8 changes: 3 additions & 5 deletions tests/analysis/trials/test_has_passed_step_scenarios.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Standard Library
import os

# Third Party
import pytest

# First Party
from smdebug.core.config_constants import INCOMPLETE_STEP_WAIT_WINDOW_KEY
from smdebug.core.tensor import StepState
from smdebug.exceptions import NoMoreData, StepUnavailable
from smdebug.trials import create_trial
Expand Down Expand Up @@ -419,7 +419,7 @@ def test_three_writers_not_all_steps_written_but_later_step_written_complete_job


@pytest.mark.slow
def test_override_if_too_many_steps_skipped():
def test_override_if_too_many_steps_skipped(monkeypatch):
"""Test Scenario Description"
workers : [a,b,c]
steps :{
Expand Down Expand Up @@ -449,7 +449,7 @@ def test_override_if_too_many_steps_skipped():
window is smaller than the set threshold
"""

os.environ["INCOMPLETE_STEP_WAIT_WINDOW"] = "10"
monkeypatch.setenv(INCOMPLETE_STEP_WAIT_WINDOW_KEY, "10")

path = "s3://smdebug-testing/resources/has_step_scenarios/too-many-steps-skipped"
trial = create_trial(path)
Expand Down Expand Up @@ -487,8 +487,6 @@ def test_override_if_too_many_steps_skipped():
== "resources/has_step_scenarios/too-many-steps-skipped/index/000000000/000000000009_worker_2.json"
)

del os.environ["INCOMPLETE_STEP_WAIT_WINDOW"]


@pytest.mark.slow
def test_partially_written_tensors():
Expand Down

0 comments on commit 47265df

Please sign in to comment.