Skip to content

Commit

Permalink
changing test rule to throw in case of errors (aws#105)
Browse files Browse the repository at this point in the history
* fixing a bug in regex which is causing mxnet test to fail

* removing parenthesis from assert

* modifying regex

* fixing invoker

* argparse flag is str; changing it from bool to str in the invoker

* Adding option to run test regexes and make the integration test runner clean
  • Loading branch information
Vikas-kum authored Aug 19, 2019
1 parent dd57c78 commit 4e51959
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 35 deletions.
5 changes: 2 additions & 3 deletions tests/analysis/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -160,16 +160,15 @@
--rule_name allzero --flag True --end_step 3 --collection_names ReluActivation
]


# test cases for pytorch
-
- exploding_tensor/pytorch/false
- pytorch
- *Enable
- [*torch_simple,
--lr .01 --momentum 0.8 --tornasole-frequency 8 --steps 20 --hook-type saveall --random-seed True,
--lr .01 --momentum 0.8 --tornasole-frequency 15 --steps 46 --hook-type saveall --random-seed True,
*invoker,
--rule_name explodingtensor --end_step 20 --flag False
--rule_name explodingtensor --end_step 46 --flag False
]
-
- exploding_tensor/pytorch/true
Expand Down
2 changes: 1 addition & 1 deletion tests/analysis/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
def pytest_addoption(parser):
parser.addoption('--mode', action='store', dest='mode', default=None)
parser.addoption('--mode', dest='mode', default=None)
parser.addoption('--path_to_config', action='store', dest='path_to_config', default=None)
parser.addoption('--tf_path', action='store', dest='tf_path', default=None)
parser.addoption('--pytorch_path', action='store', dest='pytorch_path', default=None)
Expand Down
56 changes: 40 additions & 16 deletions tests/analysis/integration_testing_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,6 @@ def __init__(self, framework, path_to_config=None, env_dict={}, test_case_list=[
env_dict['core_path'] is not None else '.'
os.environ['CODEBUILD_SRC_DIR'] = env_dict['CODEBUILD_SRC_DIR'] if 'CODEBUILD_SRC_DIR' in env_dict and \
env_dict['CODEBUILD_SRC_DIR'] is not None else '.'
# create s3 client
self.s3 = boto3.resource('s3')
# create a local folder to store log files
if not os.path.exists('./integration_test_log'):
os.mkdir('./integration_test_log/')
Expand Down Expand Up @@ -134,7 +132,7 @@ def upload_log_to_s3(self, s3_prefix):
files_to_upload = []
## tornasole.log
from shutil import copyfile

s3 = boto3.client('s3')
copyfile('tornasole.log', 'tornasole-s3.log')
files_to_upload.append('tornasole-s3.log')
## integration_test_log
Expand All @@ -147,9 +145,10 @@ def upload_log_to_s3(self, s3_prefix):
files_to_upload.append(os.path.join(r, file))
## upload
for log_file in files_to_upload:
print("Uploading file: {} to s3".format(log_file))
self.s3.Object(BUCKET, s3_prefix + "/" + log_file).put(
Body=open(log_file, 'rb'))
logger.info("Uploading file: {} to s3".format(log_file))
with open(log_file) as lf:
s3.put_object(Bucket=BUCKET, Key= s3_prefix + "/" + log_file, Body=lf)


def delete_local_log(self):
files = glob.glob('./integration_test_log/*')
Expand Down Expand Up @@ -180,7 +179,9 @@ def run_one_script(self, path_to_script, script_args, trial_dir, job_name, mode,
commands = "python {} --tornasole_path {} {}".format(path_to_script, trial_dir, script_args)
logger.info("IntegrationTest running command {}".format(commands))
# use subprocess to execute cmd line prompt
command_list = commands.split(' ')
command_list = commands.split()
logger.info("command_list : {}".format(command_list))

# create a subprocess using Popen
p = Popen(command_list,
stdout=PIPE if self.stdout_mode else None,
Expand All @@ -204,6 +205,27 @@ def run_one_script(self, path_to_script, script_args, trial_dir, job_name, mode,
error, out))
## returning exit code
exit(p.returncode)

def _is_test_allowed(self, job):
    """Decide whether a test 'job' should run under the configured filters.

    A job runs when all of the following hold:
      * it is enabled (job[SHOULD_RUN_INDEX] is truthy),
      * it matches the framework filter (or no framework filter is set),
      * it is selected: either no test-case list and no regex were given,
        or it satisfies at least one of the filters that WERE given.

    :param job: test-case record indexed by TEST_NAME_INDEX,
        SHOULD_RUN_INDEX and FRAMEWORK_INDEX (module-level constants).
    :return: True if the job should be executed, False otherwise.
    """
    name = job[TEST_NAME_INDEX]

    # Which selection filters are active?
    has_case_filter = len(self.test_cases) > 0
    has_regex_filter = self.test_case_regex is not None

    in_test_cases = has_case_filter and name in self.test_cases
    if has_case_filter:
        logger.info("Test cases specified, in_test_cases is {} testname:{}".format(in_test_cases, name))

    matches_regex = has_regex_filter and \
        re.match(self.test_case_regex, name) is not None
    if has_regex_filter:
        logger.info("Test regex specified, matches_regex is {} testname:{}".format(matches_regex, name))

    # Selected when no filter is active, or when at least one active filter
    # matches. (Previously both flags defaulted to True and were OR-ed, so a
    # single failing filter was always masked by the other filter's default
    # and never excluded anything.)
    selected = not (has_case_filter or has_regex_filter) \
        or in_test_cases or matches_regex

    is_enabled = job[SHOULD_RUN_INDEX]
    logger.info("Test {} is enabled:{}".format(name, is_enabled))
    # No framework filter means every framework is allowed (matches the
    # pre-refactor behavior: 'self.framework is None or ...').
    is_framework_allowed = self.framework is None or \
        (job[FRAMEWORK_INDEX] == self.framework)
    is_allowed = is_enabled and selected and is_framework_allowed
    logger.info("Test {} is allowed:{}".format(name, is_allowed))
    return is_allowed
# run 'job's provided by user. a 'job' is a training/test scripts combination
# mode: testing mode, either 'auto' or 'manual'
# jobs: a list of lists, the sublist is called a ‘job’
Expand All @@ -229,16 +251,17 @@ def run_jobs(self):
# <path_test_script>,
# <test_script_args>
# ]
if job[FRAMEWORK_INDEX] != 'tensorflow' and job[FRAMEWORK_INDEX] != 'pytorch' \
and job[FRAMEWORK_INDEX] != 'mxnet' and job[TEST_NAME_INDEX] != 'values':
framework = job[FRAMEWORK_INDEX]
if job[TEST_NAME_INDEX] == 'values':
continue
ALLOWED_FRAMEWORK = ['tensorflow', 'pytorch', 'mxnet'] ## Note values is first dict in yaml file. It's a hack
if framework not in ALLOWED_FRAMEWORK:
raise Exception('Wrong test case category', job[TEST_NAME_INDEX])

if not self._is_test_allowed(job):
continue
# if user has specified regex search for certain test cases, only of these, which are turned on would run
if (self.test_case_regex is not None and re.match(self.test_case_regex, job[TEST_NAME_INDEX]) is not None
and job[SHOULD_RUN_INDEX]) or\
(self.test_case_regex is None and self.test_cases != [] and job[TEST_NAME_INDEX] in self.test_cases
and job[SHOULD_RUN_INDEX]) or\
(self.test_case_regex is None and self.test_cases == []
and (self.framework is None or job[FRAMEWORK_INDEX] == self.framework) and job[SHOULD_RUN_INDEX]):
else:
job_info = job[TEST_INFO_INDEX]
for mode_1 in self.serial_and_parallel:
if self.serial_and_parallel[mode_1]:
Expand Down Expand Up @@ -292,7 +315,8 @@ def run_jobs(self):
if exit_code > 0:
# upload all the files to s3
self.upload_log_to_s3(upload_time_str)

msg = "exit code of pytest run non zero. Please check logs in s3://{}{}".format(BUCKET, upload_time_str)
assert False, msg
# once all jobs are finished, delete the outputs on local and s3
#self.delete_local_trials(local_trials)
#self.delete_s3_trials(s3_trials)
Expand Down
20 changes: 9 additions & 11 deletions tests/analysis/invoker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,10 @@ def invoke_rule(rule_obj, flag, start_step, end_step):
step = start_step if start_step is not None else 0
logger.info('Started execution of rule {} at step {}'
.format(type(rule_obj).__name__, step))
return_false = False
exception_thrown = "False"
while (end_step is None) or (step < end_step): # if end_step is not provided, do infinite checking
try:
rule_obj.invoke(step)
if flag == 'False':
return_false = True
elif flag == 'True':
# every step should return True in this case,
# meaning exception condition should be met
assert False
step += 1
except StepUnavailable as e:
logger.info(e)
Expand All @@ -28,13 +22,17 @@ def invoke_rule(rule_obj, flag, start_step, end_step):
except RuleEvaluationConditionMet as e:
logger.info(e)
step += 1
exception_thrown = "True"
break
except NoMoreData as e:
logger.info(e)
break

# if flag is False, return_false should be True after the loop
if flag == 'False':
assert return_false
logger.info('Ending execution of rule {} with step={} '
.format(rule_obj.__class__.__name__, step - 1))

msg = "Flag passed :{} , exception_thrown:{}".format(flag, exception_thrown)
if flag != exception_thrown:
assert False, msg

if __name__ == '__main__':
import argparse
Expand Down
6 changes: 4 additions & 2 deletions tornasole/core/tfevent/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,11 @@ def match_regex(s):

@staticmethod
def load_filename(s, print_error=True):
m = re.search('(\d+)_(.*).tfevents', s)
last_delimiter_index = s.rfind('/')
event_file_name = s[last_delimiter_index+1 : ]
m = re.search('(.*)_(.*).tfevents', event_file_name)
if m:
step_num = m.group(1)
step_num = int(m.group(1))
worker_name = m.group(2)
return EventFileLocation(step_num=step_num, worker_name=worker_name)
else:
Expand Down
1 change: 0 additions & 1 deletion tornasole/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def flatten(lis):
new_lis.append(item)
return new_lis


_logger_initialized = False


Expand Down
3 changes: 3 additions & 0 deletions tornasole/trials/s3_trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,15 @@ def _read_all_events_file_from_s3(self, start_after_key=None):
objects = self._list_s3_objects(self.bucket_name,
os.path.join(self.prefix_name, 'events'),
start_after_key)
self.logger.debug("Got objects:{}".format(objects))
for objname in objects:
efl = EventFileLocation.match_regex(objname)
if efl:
if (self.range_steps is not None and self._step_in_range(efl.step_num)) or \
self.range_steps is None:
self.keys.append(objname)
else:
self.logger.debug("Skipping step:{} as it is not in range{} {}".format(efl.step_num, self.range_steps[0], self.range_steps[1]))
else:
self.logger.debug(f'Skipping object {objname}')
self.logger.debug(f'Loading {len(self.keys)} new steps')
Expand Down
2 changes: 1 addition & 1 deletion tornasole/trials/trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def maybe_refresh(self, name=None):
return
retry_count = 1
training_ended = self.training_ended()
if training_ended and not self.loaded_all_steps:
if training_ended and self.loaded_all_steps== False:
retry_count = 2
while retry_count > 0:
if name is None:
Expand Down

0 comments on commit 4e51959

Please sign in to comment.