Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix logging issues #108

Merged
merged 17 commits into from
Oct 7, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .coveragerc_py38
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[run]
branch = True
timid = True

[report]
exclude_lines =
pragma: no cover
pragma: py3 no cover
if six.PY2
elif six.PY2

partial_branches =
pragma: no cover
pragma: py3 no cover
if six.PY3
elif six.PY3

show_missing = True

fail_under = 90
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.idea
.vscode
.cache/
build/
dist/
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.9.3.dev0
4.0.0
6 changes: 3 additions & 3 deletions buildspec-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ phases:
# run unit tests
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
tox -e py36,py37 --parallel all -- test/unit
tox -e py37,py38 --parallel all -- test/unit

# run functional tests
- $(aws ecr get-login --no-include-email --region us-west-2)
- IGNORE_COVERAGE=- tox -e py36,py37 -- test/functional
- IGNORE_COVERAGE=- tox -e py37,py38 -- test/functional


# build dummy container
Expand All @@ -36,7 +36,7 @@ phases:
- cd ../..

# run local integration tests
- IGNORE_COVERAGE=- tox -e py36,py37 -- test/integration/local
- IGNORE_COVERAGE=- tox -e py37,py38 -- test/integration/local

# generate the distribution package
- python3 setup.py sdist
Expand Down
6 changes: 3 additions & 3 deletions buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ phases:
# run unit tests
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
tox -e py36,py37 --parallel all -- test/unit
tox -e py37,py38 --parallel all -- test/unit

# run functional tests
- $(aws ecr get-login --no-include-email --region us-west-2)
- IGNORE_COVERAGE=- tox -e py36,py37 -- test/functional
- IGNORE_COVERAGE=- tox -e py37,py38 -- test/functional

# build dummy container
- python setup.py sdist
Expand All @@ -33,4 +33,4 @@ phases:
- cd ../..

# run local integration tests
- IGNORE_COVERAGE=- tox -e py36,py37 -- test/integration/local
- IGNORE_COVERAGE=- tox -e py37,py38 -- test/integration/local
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we planning not to update py36 containers?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. This is because of asyncio.run which is available from Py3.7. There are other ways to achieve this using asyncio in py3.6 but it is too low level and handling them is very difficult.

7 changes: 3 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License'). You
# may not use this file except in compliance with the License. A copy of
Expand Down Expand Up @@ -74,9 +74,8 @@ def read_version():
"Natural Language :: English",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
],
install_requires=required_packages,
extras_require={
Expand All @@ -86,7 +85,7 @@ def read_version():
"pytest-cov",
"mock",
"sagemaker[local]<2",
"black==19.3b0 ; python_version >= '3.6'",
"black==19.3b0 ; python_version >= '3.7'",
]
},
entry_points={"console_scripts": ["train=sagemaker_training.cli.train:main"]},
Expand Down
57 changes: 57 additions & 0 deletions src/sagemaker_training/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,60 @@
# language governing permissions and limitations under the License.
"""This file is executed when the sagemaker_training package is imported."""
from __future__ import absolute_import

# list of errors: To show user error message on the SM Training job page
# [x for x in dir(__builtins__) if 'Error' in x]
_PYTHON_ERRORS_ = [
"ArithmeticError",
"AssertionError",
"AttributeError",
"BlockingIOError",
"BrokenPipeError",
"BufferError",
"ChildProcessError",
"ConnectionAbortedError",
"ConnectionError",
"ConnectionRefusedError",
"ConnectionResetError",
"EOFError",
"EnvironmentError",
"FileExistsError",
"FileNotFoundError",
"FloatingPointError",
"IOError",
"ImportError",
"IndentationError",
"IndexError",
"InterruptedError",
"IsADirectoryError",
"KeyError",
"LookupError",
"MemoryError",
"ModuleNotFoundError",
"NameError",
"NotADirectoryError",
"NotImplementedError",
"OSError",
"OverflowError",
"PermissionError",
"ProcessLookupError",
"RecursionError",
"ReferenceError",
"RuntimeError",
"SyntaxError",
"SystemError",
"TabError",
"TimeoutError",
"TypeError",
"UnboundLocalError",
"UnicodeDecodeError",
"UnicodeEncodeError",
"UnicodeError",
"UnicodeTranslateError",
"ValueError",
"ZeroDivisionError",
"Invalid requirement",
"ResourceExhaustedError",
"OutOfRangeError",
"InvalidArgumentError",
]
22 changes: 11 additions & 11 deletions src/sagemaker_training/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,17 @@ def default(_array_like):
def json_to_numpy(string_like, dtype=None):
"""Convert a JSON object to a numpy array.

Args:
string_like (str): JSON string.
dtype (dtype, optional): Data type of the resulting array. If None,
the dtypes will be determined by the
contents of each column, individually.
This argument can only be used to
'upcast' the array. For downcasting,
use the .astype(t) method.
Returns:
(np.array): Numpy array.
"""
Args:
string_like (str): JSON string.
dtype (dtype, optional): Data type of the resulting array. If None,
the dtypes will be determined by the
contents of each column, individually.
This argument can only be used to
'upcast' the array. For downcasting,
use the .astype(t) method.
Returns:
(np.array): Numpy array.
"""
data = json.loads(string_like)
return np.array(data, dtype=dtype)

Expand Down
2 changes: 1 addition & 1 deletion src/sagemaker_training/entry_point.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License'). You
# may not use this file except in compliance with the License. A copy of
Expand Down
38 changes: 18 additions & 20 deletions src/sagemaker_training/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
SAGEMAKER_BASE_PATH = os.path.join("/opt", "ml") # type: str
BASE_PATH_ENV = "SAGEMAKER_BASE_DIR" # type: str

HYPERPARAMETERS_FILE = "hyperparameters.json" # type: str
RESOURCE_CONFIG_FILE = "resourceconfig.json" # type: str
INPUT_DATA_CONFIG_FILE = "inputdataconfig.json" # type: str


def _write_json(obj, path): # type: (object, str) -> None
"""Write a serializeable object as a JSON file."""
Expand Down Expand Up @@ -65,10 +69,11 @@ def _is_training_path_configured(): # type: () -> bool

def _set_base_path_env(): # type: () -> None
"""Set the environment variable SAGEMAKER_BASE_DIR as
~/sagemaker_local/{timestamp}/opt/ml
~/sagemaker_local/jobs/{timestamp}/opt/ml
"""
timestamp = str(time.time())
local_config_dir = os.path.join(
os.path.expanduser("~"), "sagemaker_local", "jobs", str(time.time()), "opt", "ml"
os.path.expanduser("~"), "sagemaker_local", "jobs", timestamp, "opt", "ml"
)

logger.info("Setting environment variable SAGEMAKER_BASE_DIR as %s ." % local_config_dir)
Expand Down Expand Up @@ -139,18 +144,13 @@ def _set_base_path_env(): # type: () -> None
str: the path to the intermediate output directory, e.g. /opt/ml/output/intermediate.
"""

HYPERPARAMETERS_FILE = "hyperparameters.json" # type: str
RESOURCE_CONFIG_FILE = "resourceconfig.json" # type: str
INPUT_DATA_CONFIG_FILE = "inputdataconfig.json" # type: str

hyperparameters_file_dir = os.path.join(input_config_dir, HYPERPARAMETERS_FILE) # type: str
input_data_config_file_dir = os.path.join(input_config_dir, INPUT_DATA_CONFIG_FILE) # type: str
resource_config_file_dir = os.path.join(input_config_dir, RESOURCE_CONFIG_FILE) # type: str


def _create_training_directories():
"""Create the directory structure and files necessary for training under the base path.
"""
"""Create the directory structure and files necessary for training under the base path."""
logger.info("Creating a new training folder under %s ." % base_dir)

os.makedirs(model_dir)
Expand Down Expand Up @@ -196,7 +196,7 @@ def read_hyperparameters(): # type: () -> dict
"""Read the hyperparameters from /opt/ml/input/config/hyperparameters.json.

For more information about hyperparameters.json:
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-hyperparameters
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-hyperparameters

Returns:
(dict[string, object]): A dictionary containing the hyperparameters.
Expand Down Expand Up @@ -225,7 +225,7 @@ def read_resource_config(): # type: () -> dict
"""Read the resource configuration from /opt/ml/input/config/resourceconfig.json.

For more information about resourceconfig.json:
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-dist-training
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-dist-training

Returns:
resource_config (dict[string, object]): the contents from /opt/ml/input/config/resourceconfig.json.
Expand Down Expand Up @@ -264,7 +264,7 @@ def read_input_data_config(): # type: () -> dict
}}

For more information about inpudataconfig.json:
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-dist-training
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-inputdataconfig

Returns:
input_data_config (dict[string, object]): Contents from /opt/ml/input/config/inputdataconfig.json.
Expand All @@ -273,7 +273,7 @@ def read_input_data_config(): # type: () -> dict


def channel_path(channel): # type: (str) -> str
""" Return the directory containing the channel data file(s) which is:
"""Return the directory containing the channel data file(s) which is:
- <self.base_dir>/input/data/<channel>

For more information about channels: https://docs.aws.amazon.com/sagemaker/latest/dg/API_Channel.html
Expand Down Expand Up @@ -326,7 +326,7 @@ class Environment(mapping.MappingMixin): # pylint:disable=too-many-public-metho
get the path of the channel 'training' from the inputdataconfig.json file
>>>training_dir = environment.channel_input_dirs['training']

get a the hyperparameter 'training_data_file' from hyperparameters.json file
get the hyperparameter 'training_data_file' from hyperparameters.json file
>>>file_name = environment.hyperparameters['training_data_file']

get the folder where the model should be saved
Expand Down Expand Up @@ -407,7 +407,7 @@ class Environment(mapping.MappingMixin): # pylint:disable=too-many-public-metho
}}

You can find more information about /opt/ml/input/config/inputdataconfig.json here:
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-inputdataconfig
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-inputdataconfig

output_data_dir (str): The dir to write non-model training artifacts (e.g. evaluation
results) which will be retained by SageMaker,
Expand Down Expand Up @@ -476,7 +476,7 @@ def __init__(self, resource_config=None, input_data_config=None, hyperparameters
}}

You can find more information about /opt/ml/input/config/inputdataconfig.json here:
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-inputdataconfig
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-inputdataconfig

hyperparameters (dict[string, object]): An instance of `HyperParameters` containing the
training job hyperparameters.
Expand Down Expand Up @@ -564,7 +564,7 @@ def __init__(self, resource_config=None, input_data_config=None, hyperparameters

@property
def model_dir(self): # type: () -> str
""" The directory where models should be saved.
"""The directory where models should be saved.

Returns:
str: The directory where models should be saved, e.g., /opt/ml/model/
Expand Down Expand Up @@ -651,14 +651,12 @@ def _parse_module_name(program_param):

@property
def is_master(self): # type: () -> bool
"""Returns True if host is master.
"""
"""Returns True if host is master."""
return self._is_master

@property
def master_hostname(self): # type: () -> str
"""Returns the hostname of the master node.
"""
"""Returns the hostname of the master node."""
return self._master_hostname

@property
Expand Down
34 changes: 26 additions & 8 deletions src/sagemaker_training/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,39 @@ class _CalledProcessError(ClientError):
cmd, return_code, output
"""

def __init__(self, cmd, return_code=None, output=None):
self.return_code = return_code
def __init__(self, cmd, return_code=None, output=None, info=None):
self.return_code = str(return_code)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can return_code be None, in which case conversion to str will fail. should we add a check here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

str conversion of NoneType object will be just None. So it should be fine in this case.

self.cmd = cmd
self.output = output
self.extra_info = info
super(_CalledProcessError, self).__init__()

def __str__(self):
if six.PY3 and self.output:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we remove the PY3 check, since PY2 is deprecated ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching these. Removed PY2 related in the following commit

error_msg = "\n%s" % self.output.decode("latin1")
# error_msg = "%s" % self.output.decode("latin1")
if isinstance(self.output, bytes):
error_msg = "%s" % self.output.decode("utf-8")
else:
error_msg = "%s" % self.output
elif self.output:
error_msg = "\n%s" % self.output
error_msg = "%s" % self.output
else:
error_msg = ""

message = '%s:\nCommand "%s"%s' % (type(self).__name__, self.cmd, error_msg)
if self.extra_info is None:
message = '%s:\nExitCode %s\nErrorMessage "%s"\nCommand "%s"' % (
type(self).__name__,
self.return_code,
error_msg,
self.cmd,
)
else:
message = '%s:\nExitCode %s\nErrorMessage "%s"\nExtraInfo "%s"\nCommand "%s"' % (
type(self).__name__,
self.return_code,
error_msg,
self.extra_info,
self.cmd,
)
return message.strip()


Expand All @@ -64,11 +82,11 @@ class ExecuteUserScriptError(_CalledProcessError):
"""Error class indicating a user script failed to execute."""


class ChannelDoesNotExistException(Exception):
class ChannelDoesNotExistError(Exception):
"""Error class indicating a channel does not exist."""

def __init__(self, channel_name):
super(ChannelDoesNotExistException, self).__init__(
super(ChannelDoesNotExistError, self).__init__(
"Channel %s is not a valid channel" % channel_name
)

Expand Down
1 change: 0 additions & 1 deletion src/sagemaker_training/intermediate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,6 @@ def start_sync(
watchers = {}
wd = inotify.add_watch(intermediate_path, watch_flags)
watchers[wd] = ""

# start subprocess to sync any files from intermediate folder to s3
p = multiprocessing.Process(target=_watch, args=[inotify, watchers, watch_flags, s3_uploader])
# Make the process daemonic as a safety switch to prevent training job from hanging forever
Expand Down
Loading