Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add power submission checker #402

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions mlperf_logging/package_checker/package_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ..compliance_checker.mlp_compliance import usage_choices, rule_choices
from ..rcp_checker import rcp_checker
from .seed_checker import find_source_files_under, SeedChecker
from .power_checker import PowerChecker
from ..system_desc_checker import system_desc_checker

from ..benchmark_meta import get_allowed_benchmarks, get_result_file_counts
Expand Down Expand Up @@ -47,6 +48,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
global_seed_checker_bypass = seed_checker_bypass

seed_checker = SeedChecker(ruleset)
power_checker = PowerChecker(ruleset)
too_many_errors = False
result_folder = os.path.join(folder, 'results')
for system_folder in _get_sub_folders(result_folder):
Expand Down Expand Up @@ -146,6 +148,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
logging.warning('Unknown files in result directory: %s', benchmark_folder)

errors_found = 0
error_set = set({})
result_files.sort()
for result_file in result_files:
result_basename = os.path.basename(result_file)
Expand Down Expand Up @@ -173,13 +176,19 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
)
if not valid:
errors_found += 1
if errors_found == 1 and benchmark != 'unet3d':
error_set.add(result_name)
power_folder = os.path.join(benchmark_folder, "power")
if os.path.exists(power_folder):
power_valid, power_errors = power_checker.check_power(power_folder, result_files)
error_set = error_set | power_errors
error_list = list(error_set)
if len(error_list) == 1 and benchmark != 'unet3d':
logging.warning(" 1 file does not comply, accepting this under olympic scoring")
elif errors_found > 0 and errors_found <= 4 and benchmark == 'unet3d':
logging.warning(" %d files do not comply, accepting this under olympic scoring", errors_found)
elif errors_found > 0:
elif len(error_list) > 0 and len(error_list) <= 4 and benchmark == 'unet3d':
logging.warning(" %d files do not comply, accepting this under olympic scoring", len(error_list))
elif len(error_list) > 0:
too_many_errors = True
logging.error(" %d files do not comply, directory cannot be accepted", errors_found)
logging.error(" %d files do not comply, directory cannot be accepted", len(error_list))

# Check if each run use unique seeds.
if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0'} and division == 'closed':
Expand Down
96 changes: 96 additions & 0 deletions mlperf_logging/package_checker/power_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import warnings
import os
import logging


class PowerChecker:
    """Check for errors in the MLPerf Power submissions.

    Current checks are:

    1. Check there is a power folder for each result
    2. Check the power file names (``node_x`` / ``sw_x`` with ``x`` forming
       the contiguous range ``[0, count - 1]`` without duplicates)
    3. Check there are the same number of nodes and switches in each run
       (no file is missing)

    Unsatisfying any of the above checks results in failure.
    """

    def __init__(self, ruleset):
        """Store the ruleset identifier this checker was created for."""
        self._ruleset = ruleset

    def check_range(self, l, n):
        """Return True when every index in ``l`` is unique and within [0, n-1].

        Args:
            l: Iterable of integer indices parsed from power file names.
            n: Number of files the indices were parsed from.

        Returns:
            False as soon as an out-of-range or repeated index is seen,
            True otherwise (including for an empty ``l``).
        """
        seen = set()
        for e in l:
            if e < 0 or e >= n or e in seen:
                return False
            # Remember the index so that a later duplicate is rejected.
            seen.add(e)
        return True

    def check_equals(self, l):
        """Find the positions in ``l`` that differ from its most common value.

        Args:
            l: Sequence of hashable values (here: per-run file counts).

        Returns:
            A tuple ``(all_equal, errors)`` where ``errors`` lists the
            positions whose value is not the most frequent one. Ties are
            resolved in favour of the value that appears first. An empty
            input is trivially consistent and yields ``(True, [])``.
        """
        if not l:
            # max() on an empty mapping would raise ValueError.
            return True, []

        counter = {}
        for e in l:
            counter[e] = counter.get(e, 0) + 1
        max_equals = max(counter, key=counter.get)
        errors = [i for i, e in enumerate(l) if e != max_equals]
        return not errors, errors

    @staticmethod
    def _parse_indices(file_names):
        """Return the trailing ``_<int>`` index of each name, or None if any is malformed."""
        indices = []
        for file_name in file_names:
            stem = os.path.splitext(os.path.basename(file_name))[0]
            try:
                indices.append(int(stem.split('_')[-1]))
            except ValueError:
                return None
        return indices

    def _find_inconsistent(self, label, lens, names, power_folder):
        """Log and return the result names whose file count differs from the majority."""
        _, errors = self.check_equals(lens)
        # ``lens`` and ``names`` are built in lockstep, so positions line up.
        offenders = [names[error] for error in errors]
        for error_result in offenders:
            logging.warning("Inconsistent number of %s in directory %s/%s", label, power_folder, error_result)
            logging.warning("Directory %s/%s does not comply", power_folder, error_result)
        return set(offenders)

    def check_power(self, power_folder, result_files):
        """Validate the power measurements stored next to a set of result files.

        For every result file a sub-directory of ``power_folder`` with the
        result's base name (extension removed) is expected. Inside it, files
        starting with ``node`` and ``sw`` are checked for naming and, across
        all runs, for a consistent count.

        Args:
            power_folder: Path of the ``power`` directory of one benchmark.
            result_files: Paths of the result files of that benchmark.

        Returns:
            A tuple ``(valid, errors_set)``; ``errors_set`` holds the base
            names of all results that failed at least one check and
            ``valid`` is True only when that set is empty.
        """
        # The two path components above ``power`` are used only for logging;
        # pad so that an unusually short path cannot break the unpacking.
        parts = os.path.normpath(power_folder).split(os.sep)
        system, benchmark = (["", ""] + parts[:-1])[-2:]
        errors_set = set()

        # Names and counts of the runs that actually have power data, kept in
        # lockstep so inconsistencies are attributed to the right run.
        checked_names = []
        node_lens = []
        sw_lens = []
        for result_file in result_files:
            result_name, _ = os.path.splitext(os.path.basename(result_file))
            power_result_folder = os.path.join(power_folder, result_name)
            if not os.path.isdir(power_result_folder):
                logging.warning("Package does not contain power result for %s/%s: %s", system, benchmark, result_name)
                errors_set.add(result_name)
                continue

            power_files = os.listdir(power_result_folder)
            node_results = [file for file in power_files if file.startswith("node")]
            sw_results = [file for file in power_files if file.startswith("sw")]

            if len(power_files) > len(node_results) + len(sw_results):
                logging.warning("Detected %d total files in directory %s, but some do not conform", len(power_files), power_result_folder)

            node_idx = self._parse_indices(node_results)
            if node_idx is None or not self.check_range(node_idx, len(node_results)):
                logging.warning("Bad naming of node power files in directory %s, expected to be node_x with x in range [0, %d]", power_result_folder, len(node_results)-1)
                errors_set.add(result_name)
            sw_idx = self._parse_indices(sw_results)
            if sw_idx is None or not self.check_range(sw_idx, len(sw_results)):
                logging.warning("Bad naming of sw power files in directory %s, expected to be sw_x with x in range [0, %d]", power_result_folder, len(sw_results)-1)
                errors_set.add(result_name)

            checked_names.append(result_name)
            node_lens.append(len(node_results))
            sw_lens.append(len(sw_results))

        errors_set |= self._find_inconsistent("nodes", node_lens, checked_names, power_folder)
        errors_set |= self._find_inconsistent("sw", sw_lens, checked_names, power_folder)
        return not errors_set, errors_set
Loading