diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py index 50b22c6..b2811ad 100644 --- a/mlperf_logging/package_checker/package_checker.py +++ b/mlperf_logging/package_checker/package_checker.py @@ -14,6 +14,7 @@ from ..compliance_checker.mlp_compliance import usage_choices, rule_choices from ..rcp_checker import rcp_checker from .seed_checker import find_source_files_under, SeedChecker +from .power_checker import PowerChecker from ..system_desc_checker import system_desc_checker from ..benchmark_meta import get_allowed_benchmarks, get_result_file_counts @@ -47,6 +48,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, global_seed_checker_bypass = seed_checker_bypass seed_checker = SeedChecker(ruleset) + power_checker = PowerChecker(ruleset) too_many_errors = False result_folder = os.path.join(folder, 'results') for system_folder in _get_sub_folders(result_folder): @@ -146,6 +148,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, logging.warning('Unknown files in result directory: %s', benchmark_folder) errors_found = 0 + error_set = set({}) result_files.sort() for result_file in result_files: result_basename = os.path.basename(result_file) @@ -173,13 +176,19 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, ) if not valid: errors_found += 1 - if errors_found == 1 and benchmark != 'unet3d': + error_set.add(result_name) + power_folder = os.path.join(benchmark_folder, "power") + if os.path.exists(power_folder): + power_valid, power_errors = power_checker.check_power(power_folder, result_files) + error_set = error_set | power_errors + error_list = list(error_set) + if len(error_list) == 1 and benchmark != 'unet3d': logging.warning(" 1 file does not comply, accepting this under olympic scoring") - elif errors_found > 0 and errors_found <= 4 and benchmark == 'unet3d': - logging.warning(" %d files do not comply, accepting this under 
import logging
import os
import warnings
from collections import Counter


class PowerChecker:
    """Check for errors in MLPerf Power submissions.

    Current checks are:

    1. There is a power folder for each result.
    2. The power files are named ``node_<i>`` / ``sw_<i>`` and, within a
       run, the indices cover ``[0, count - 1]`` without duplicates.
    3. Every run has the same number of node and switch files
       (no file is missing).

    Failing any of the above checks results in failure.
    """

    def __init__(self, ruleset):
        """Store the ruleset version the checker was created for."""
        self._ruleset = ruleset

    def check_range(self, l, n):
        """Return True if every index in ``l`` is unique and in ``[0, n - 1]``.

        Args:
            l: iterable of integer indices parsed from power file names.
            n: exclusive upper bound for a valid index.

        Returns:
            bool: False as soon as an out-of-range or repeated index is seen.
        """
        seen = set()
        for e in l:
            if e < 0 or e > n - 1 or e in seen:
                return False
            # Remember the index so a second file claiming it is rejected.
            seen.add(e)
        return True

    def check_equals(self, l):
        """Find the entries of ``l`` that differ from its most common value.

        Args:
            l: list of hashable values (file counts, one per run).

        Returns:
            tuple: ``(all_equal, errors)`` where ``errors`` lists the
            positions in ``l`` whose value is not the most common one. An
            empty ``l`` is trivially consistent and yields ``(True, [])``.
        """
        if not l:
            # Nothing to compare; max() on an empty mapping would raise.
            return True, []
        counter = Counter(l)
        # On ties, max() keeps the value that was seen first.
        most_common = max(counter, key=counter.get)
        errors = [i for i, e in enumerate(l) if e != most_common]
        return not errors, errors

    @staticmethod
    def _file_indices(files):
        """Parse the trailing ``_<int>`` index of each file name.

        Returns the list of indices, or None if any name has no integer
        suffix (so the caller can report it as a naming error).
        """
        indices = []
        for file in files:
            stem = os.path.splitext(os.path.basename(file))[0]
            try:
                indices.append(int(stem.split('_')[-1]))
            except ValueError:
                return None
        return indices

    def check_power(self, power_folder, result_files):
        """Validate the power logs stored under ``power_folder``.

        Args:
            power_folder: directory expected to hold one sub-folder per
                result, named after the result file without its extension.
            result_files: paths of the result files of the benchmark; only
                their base names are used.

        Returns:
            tuple: ``(valid, errors_set)`` where ``errors_set`` holds the
            names of the results that failed any check and ``valid`` is
            True only when that set is empty.
        """
        # The two path components above the power folder are used in log
        # messages as system/benchmark; pad so short paths do not fail.
        parts = os.path.normpath(power_folder).split(os.sep)
        system, benchmark = (["", ""] + parts[:-1])[-2:]
        errors_set = set()

        # Names and counts are appended together so that an index returned
        # by check_equals always maps back to the right result, even when
        # some results have no power folder at all.
        checked_names = []
        node_lens = []
        sw_lens = []
        for result_file in result_files:
            result_name, _ = os.path.splitext(os.path.basename(result_file))
            power_result_folder = os.path.join(power_folder, result_name)
            if not os.path.isdir(power_result_folder):
                logging.warning("Package does not contain power result for %s/%s: %s", system, benchmark, result_name)
                errors_set.add(result_name)
                continue

            power_files = os.listdir(power_result_folder)
            node_results = [file for file in power_files if file.startswith("node")]
            sw_results = [file for file in power_files if file.startswith("sw")]

            if len(power_files) > len(node_results) + len(sw_results):
                logging.warning("Detected %d total files in directory %s, but some do not conform", len(power_files), power_result_folder)

            for kind, files in (("node", node_results), ("sw", sw_results)):
                indices = self._file_indices(files)
                if indices is None or not self.check_range(indices, len(files)):
                    logging.warning(
                        "Bad naming of %s power files in directory %s, expected to be %s_x with x in range [0, %d]",
                        kind, power_result_folder, kind, len(files) - 1)
                    errors_set.add(result_name)

            checked_names.append(result_name)
            node_lens.append(len(node_results))
            sw_lens.append(len(sw_results))

        for label, lens in (("nodes", node_lens), ("sw", sw_lens)):
            _, errors = self.check_equals(lens)
            for index in errors:
                error_result = checked_names[index]
                logging.warning("Inconsistent number of %s in directory %s/%s", label, power_folder, error_result)
                logging.warning("Directory %s/%s does not comply", power_folder, error_result)
                errors_set.add(error_result)

        return not errors_set, errors_set