diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py
index 5b073d6..48e6bc1 100644
--- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py
+++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py
@@ -29,4 +29,4 @@ def parse_file(filename, ruleset='0.6.0'):
     elif ruleset == '4.0.0':
         return parse_file_400(filename)
     else:
-        raise Exception(f'Ruleset "{ruleset}" is not supported')
+        raise Exception(f'Ruleset "{ruleset}" is not supported')
\ No newline at end of file
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml
index f1e6c7e..9136f33 100644
--- a/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml
+++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml
@@ -8,4 +8,4 @@
 - KEY:
     NAME: gradient_accumulation_steps
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] > 0 "
+    CHECK: " v['value'] > 0 "
\ No newline at end of file
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_stable_diffusion.yaml
new file mode 100644
index 0000000..3cdc3e6
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_stable_diffusion.yaml
@@ -0,0 +1,74 @@
+# Stable diffusion uses two metrics, FID and CLIP.
+# These metrics can be calculated offline, using different scripts,
+# and logged separately. Therefore, we create a virtual key
+# called aggregated_eval_accuracy, which aggregates
+# both metrics into a single log line.
+
+- BEGIN:
+    CODE: |
+        from dataclasses import replace
+        agg_eval_lines = {}
+        for line in loglines:
+            if line.key == "eval_accuracy" and 'metric' in line.value['metadata']:
+                samples_count = line.value['metadata']['samples_count']
+                if samples_count not in agg_eval_lines:
+                    new_line = replace(line)  # Make a copy
+                    new_line.key = "aggregated_eval_accuracy"
+                    new_line.full_string = ""  # Not needed
+                    new_line.lineno = -1  # Not needed
+                    new_line.value = {'value': {'samples_count': samples_count}, 'metadata': {}}
+                    agg_eval_lines[samples_count] = new_line
+
+                agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp)
+                agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value']
+        loglines.extend(agg_eval_lines.values())
+
+
+- KEY:
+    NAME: global_batch_size
+    REQ: AT_LEAST_ONE
+    CHECK: " v['value'] >= 0 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adamw' "
+
+- KEY:
+    NAME: opt_adamw_beta_1
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.9 "
+
+- KEY:
+    NAME: opt_adamw_beta_2
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.999 "
+
+- KEY:
+    NAME: opt_adamw_epsilon
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1e-08 "
+
+- KEY:
+    NAME: opt_adamw_weight_decay
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.01 "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] >= 0.0 "
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] >= 0 "
+
+- KEY:
+    NAME: aggregated_eval_accuracy
+    REQ: AT_LEAST(2)
+    CHECK:
+        - "'FID' in v['value']"
+        - "'CLIP' in v['value']"
+        - "'samples_count' in v['value']"
+    ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)"
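The BEGIN block above is easier to follow outside the YAML. Below is a minimal, runnable sketch of the same aggregation, assuming a simplified `LogLine` dataclass in place of the parser's real logline type; the real BEGIN code runs against `loglines` produced by `mlp_parser`, and the sample scores are made up:

```python
from dataclasses import dataclass, replace

@dataclass
class LogLine:
    key: str
    value: dict
    timestamp: float = 0.0
    full_string: str = ""
    lineno: int = 0

# Two offline metrics logged as separate eval_accuracy events (illustrative values).
loglines = [
    LogLine("eval_accuracy",
            {"value": 90.0, "metadata": {"metric": "FID", "samples_count": 512000}},
            timestamp=1.0),
    LogLine("eval_accuracy",
            {"value": 0.151, "metadata": {"metric": "CLIP", "samples_count": 512000}},
            timestamp=2.0),
]

agg_eval_lines = {}
for line in loglines:
    if line.key == "eval_accuracy" and "metric" in line.value["metadata"]:
        samples_count = line.value["metadata"]["samples_count"]
        if samples_count not in agg_eval_lines:
            new_line = replace(line)  # copy the dataclass instance
            new_line.key = "aggregated_eval_accuracy"
            new_line.value = {"value": {"samples_count": samples_count}, "metadata": {}}
            agg_eval_lines[samples_count] = new_line
        agg = agg_eval_lines[samples_count]
        agg.timestamp = max(line.timestamp, agg.timestamp)
        agg.value["value"][line.value["metadata"]["metric"]] = line.value["value"]
loglines.extend(agg_eval_lines.values())

# The virtual line now carries both metrics keyed by samples_count:
# {'value': {'samples_count': 512000, 'FID': 90.0, 'CLIP': 0.151}, 'metadata': {}}
print(loglines[-1].value)
```

Grouping by `samples_count` is what lets the `aggregated_eval_accuracy` CHECK rules treat the two offline scripts' outputs as one evaluation point.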
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/common.yaml
index 1360fed..4949d29 100644
--- a/mlperf_logging/compliance_checker/training_4.0.0/common.yaml
+++ b/mlperf_logging/compliance_checker/training_4.0.0/common.yaml
@@ -97,16 +97,16 @@
     REQ: AT_LEAST_ONE_OR(epoch_start)
     CHECK:
         - "s['run_started']"
-        - "('epoch_count' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])"
         - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True"
         - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True"
-        - "v['metadata']['step_num'] >= 0 if 'step_num' in v['metadata'] else True"
+        - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True"
 
 - KEY:
     NAME: block_stop
     REQ: AT_LEAST_ONE_OR(epoch_stop)
     CHECK:
-        - "('first_epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"
 
 - KEY:
     NAME: epoch_start
@@ -125,19 +125,19 @@
     NAME: eval_start
     REQ: AT_LEAST_ONE_OR(block_start)
     CHECK:
-        - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"
 
 - KEY:
     NAME: eval_stop
     REQ: AT_LEAST_ONE_OR(block_stop)
     CHECK:
-        - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"
 
 - KEY:
     NAME: eval_accuracy
     REQ: AT_LEAST_ONE
     CHECK:
-        - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"
 
 - KEY:
     NAME: train_samples
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml
index 43f781d..e85f70c 100644
--- a/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml
+++ b/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml
@@ -3,5 +3,4 @@
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
     CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
-    POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
-
+    POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
\ No newline at end of file
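The common.yaml changes above rename the `step_num` metadata key to `samples_count` across the block/eval events, so 4.0.0 submissions must carry `samples_count` where 3.1.0 logs carried `step_num`. Roughly how these CHECK strings behave (illustrative only; the real checker supplies `v` from the parsed event and `s` from its run state):

```python
# Two CHECK expressions copied from the common.yaml rules above.
checks = [
    "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])",
    "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True",
]

v = {"value": 0.2, "metadata": {"samples_count": 512000}}  # a hard-coded sample event
s = {"run_started": True}                                  # hard-coded checker state

# Each CHECK string is a Python expression over v and s; all must be truthy.
for expr in checks:
    assert eval(expr, {}, {"v": v, "s": s}), f"check failed: {expr}"
print("all checks passed")
```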
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_stable_diffusion.yaml
new file mode 100644
index 0000000..fe25e31
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_4.0.0/open_stable_diffusion.yaml
@@ -0,0 +1,33 @@
+# Stable diffusion uses two metrics, FID and CLIP.
+# These metrics can be calculated offline, using different scripts,
+# and logged separately. Therefore, we create a virtual key
+# called aggregated_eval_accuracy, which aggregates
+# both metrics into a single log line.
+
+- BEGIN:
+    CODE: |
+        from dataclasses import replace
+        agg_eval_lines = {}
+        for line in loglines:
+            if line.key == "eval_accuracy" and 'metric' in line.value['metadata']:
+                samples_count = line.value['metadata']['samples_count']
+                if samples_count not in agg_eval_lines:
+                    new_line = replace(line)  # Make a copy
+                    new_line.key = "aggregated_eval_accuracy"
+                    new_line.full_string = ""  # Not needed
+                    new_line.lineno = -1  # Not needed
+                    new_line.value = {'value': {'samples_count': samples_count}, 'metadata': {}}
+                    agg_eval_lines[samples_count] = new_line
+
+                agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp)
+                agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value']
+        loglines.extend(agg_eval_lines.values())
+
+- KEY:
+    NAME: aggregated_eval_accuracy
+    REQ: AT_LEAST(2)
+    CHECK:
+        - "'FID' in v['value']"
+        - "'CLIP' in v['value']"
+        - "'samples_count' in v['value']"
+    ATLEAST_ONE_CHECK: "v['value']['FID'] >= 0.0 and v['value']['CLIP'] <= 1.0"
diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py
index 4547518..2b94f14 100644
--- a/mlperf_logging/mllog/constants.py
+++ b/mlperf_logging/mllog/constants.py
@@ -179,6 +179,7 @@
 FIRST_EPOCH_NUM = "first_epoch_num"
 STATUS = "status"
 STEP_NUM = "step_num"
+SAMPLES_COUNT = "samples_count"
 
 # Power constants
 POWER_MEASUREMENT_START = "power_measurement_start"
diff --git a/mlperf_logging/mllog/examples/power/compute_metric_example.py b/mlperf_logging/mllog/examples/power/compute_metric_example.py
index 59f5e4b..add3627 100644
--- a/mlperf_logging/mllog/examples/power/compute_metric_example.py
+++ b/mlperf_logging/mllog/examples/power/compute_metric_example.py
@@ -6,7 +6,7 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--input-log", type=str, default=None)
     parser.add_argument("--hardware-type", type=str, choices=["node", "sw"], default="node")
-    parser.add_argument("--ruleset", type=str, choices=["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0"], default="3.1.0")
+    parser.add_argument("--ruleset", type=str, choices=["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"], default="4.0.0")
     args = parser.parse_args()
     return args
diff --git a/mlperf_logging/package_checker/README.md b/mlperf_logging/package_checker/README.md
index aa03b41..6c7422c 100644
--- a/mlperf_logging/package_checker/README.md
+++ b/mlperf_logging/package_checker/README.md
@@ -10,7 +10,7 @@ To check an organization's submission package for compliance:
 python3 -m mlperf_logging.package_checker FOLDER USAGE RULESET
 ```
 
-Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0"] are supported.
+Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"] are supported.
 
 The package checker checks:
 1. The number of result files for each benchmark matches the required count. If
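For submitters, the practical effect of the new `SAMPLES_COUNT` constant is that each offline metric is logged as its own `eval_accuracy` event carrying `metric` and `samples_count` metadata, which the BEGIN block above then merges. An illustrative sketch using the mllog API (the scores and samples count are made up; only the key/metadata shape matters to the checker):

```python
from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()

# One event per offline metric; the compliance checker aggregates them
# into the virtual aggregated_eval_accuracy key by samples_count.
for metric, score in (("FID", 90.0), ("CLIP", 0.151)):
    mllogger.event(
        key=constants.EVAL_ACCURACY,
        value=score,
        metadata={"metric": metric, constants.SAMPLES_COUNT: 512000},
    )
```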
diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py
index fd8f033..262ace7 100644
--- a/mlperf_logging/package_checker/package_checker.py
+++ b/mlperf_logging/package_checker/package_checker.py
@@ -175,13 +175,13 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
             logging.error(" %d files do not comply, directory cannot be accepted", errors_found)
 
         # Check if each run use unique seeds.
-        if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0'} and division == 'closed':
+        if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'} and division == 'closed':
             if not seed_checker.check_seeds(result_files, source_files):
                 too_many_errors = True
                 logging.error('Seed checker failed')
 
         # Run RCP checker for >= 1.0.0
-        if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0'} and division == 'closed' and benchmark != 'minigo':
+        if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'} and division == 'closed' and benchmark != 'minigo':
             # Now go again through result files to do RCP checks
             rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass)
             rcp_pass, rcp_msg, _ = rcp_checker.check_directory(
@@ -235,7 +235,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc
         ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc.
     """
     too_many_errors = False
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0'}:
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'}:
         logging.info(' Checking System Description Files')
         system_description_pass = check_systems(folder, usage, ruleset)
         too_many_errors = too_many_errors or not system_description_pass
diff --git a/mlperf_logging/rcp_checker/README.md b/mlperf_logging/rcp_checker/README.md
index 7396f64..17af28e 100644
--- a/mlperf_logging/rcp_checker/README.md
+++ b/mlperf_logging/rcp_checker/README.md
@@ -8,10 +8,10 @@ Run Reference Convergence Point checks for a submission directory.
 This consists of testing whether a submission
 does not converge statistically faster than the reference.
 
-For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_3.1.0/*.json
+For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_4.0.0/*.json
 The RCP checker supports only the 1.0.0 version onwards.
-The current training version is 3.1.0.
+The current training version is 4.0.0.
 
 ## Usage
diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
index 9fd06ff..4fd309b 100644
--- a/mlperf_logging/rcp_checker/rcp_checker.py
+++ b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -73,7 +73,7 @@ def read_submission_file(result_file, use_train_samples):
 
             if benchmark == "stable_diffusion" and ("eval_error" in str or "eval_accuracy" in str):
                 eval_accuracy_str = str
-                eval_step = json.loads(eval_accuracy_str)["metadata"]["step_num"]
+                eval_step = json.loads(eval_accuracy_str)["metadata"]["samples_count"]
                 eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"]
                 eval_score = json.loads(eval_accuracy_str)["value"]
                 stable_diffusion_eval_results[eval_step][eval_metric] = eval_score
@@ -84,7 +84,7 @@
             elif use_train_samples and "train_samples" in str:
                 eval_accuracy_str = str
                 conv_epoch = json.loads(eval_accuracy_str)["value"]
-
+
             if "run_stop" in str and json.loads(str)["key"] == "run_stop":
                 conv_result = json.loads(str)["metadata"]["status"]
                 if conv_result == "success":
@@ -151,8 +151,8 @@ def get_submission_epochs(result_files, bert_train_samples):
 
 class RCP_Checker:
     def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None):
-        if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0"}:
-            raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0 and 3.1.0')
+        if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"}:
+            raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0')
         self.usage = usage
         self.ruleset = ruleset
         self.benchmark = benchmark
@@ -519,7 +519,7 @@ def get_parser():
     parser.add_argument('--rcp_usage', type=str, default='training',
                         choices=['training', 'hpc'],
                         help='what WG does the benchmark come from to check the log against')
-    parser.add_argument('--rcp_version', type=str, default='3.1.0',
+    parser.add_argument('--rcp_version', type=str, default='4.0.0',
                         help='what version of rules to check the log against')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--bert_train_samples', action='store_true',
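A minimal sketch of what the first hunk above now extracts from a stable_diffusion `eval_accuracy` line: the convergence point is keyed by `samples_count` instead of `step_num`. The log line is illustrative; the `:::MLLOG` prefix and JSON shape follow mllog's output format:

```python
import json
from collections import defaultdict

# An illustrative mllog line as it would appear in a result file.
log_line = (
    ':::MLLOG {"key": "eval_accuracy", "value": 90.0, '
    '"metadata": {"metric": "FID", "samples_count": 512000}}'
)

stable_diffusion_eval_results = defaultdict(dict)

# Strip the mllog prefix and index the score by (samples_count, metric),
# as read_submission_file does after this patch.
payload = json.loads(log_line[len(":::MLLOG "):])
eval_step = payload["metadata"]["samples_count"]
eval_metric = payload["metadata"]["metric"]
eval_score = payload["value"]
stable_diffusion_eval_results[eval_step][eval_metric] = eval_score

print(stable_diffusion_eval_results)  # {512000: {'FID': 90.0}}
```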
diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json
new file mode 100644
index 0000000..7bd17f2
--- /dev/null
+++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json
@@ -0,0 +1,66 @@
+{
+
+    "sd_ref_512":
+    {
+        "Benchmark": "stable_diffusion",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before v3.1",
+        "Platform": "32xDGX-A100",
+        "BS": 512,
+        "Hyperparams": {
+            "opt_adamw_beta_1": 0.9,
+            "opt_adamw_beta_2": 0.999,
+            "opt_adamw_epsilon": 1e-08,
+            "opt_adamw_weight_decay": 0.01,
+            "opt_base_learning_rate": 1.25e-7,
+            "opt_learning_rate_warmup_steps": 1000
+        },
+        "Epochs to converge": [
+            2560000, 2560000, 2560000, 2560000, 2560000,
+            2560000, 2560000, 2560000, 2560000, 2560000,
+            2560000, 2560000, 2560000, 3072000]
+    },
+
+    "sd_ref_1024":
+    {
+        "Benchmark": "stable_diffusion",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before v3.1",
+        "Platform": "32xDGX-A100",
+        "BS": 1024,
+        "Hyperparams": {
+            "opt_adamw_beta_1": 0.9,
+            "opt_adamw_beta_2": 0.999,
+            "opt_adamw_epsilon": 1e-08,
+            "opt_adamw_weight_decay": 0.01,
+            "opt_base_learning_rate": 1.25e-7,
+            "opt_learning_rate_warmup_steps": 1000
+        },
+        "Epochs to converge": [
+            2560000, 2560000, 2560000, 2560000, 2560000,
+            3072000, 3072000, 3072000, 3072000, 3072000,
+            3072000, 3072000, 2560000]
+    },
+
+    "sd_ref_2048":
+    {
+        "Benchmark": "stable_diffusion",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before v3.1",
+        "Platform": "32xDGX-A100",
+        "BS": 2048,
+        "Hyperparams": {
+            "opt_adamw_beta_1": 0.9,
+            "opt_adamw_beta_2": 0.999,
+            "opt_adamw_epsilon": 1e-08,
+            "opt_adamw_weight_decay": 0.01,
+            "opt_base_learning_rate": 1.25e-7,
+            "opt_learning_rate_warmup_steps": 1000
+        },
+        "Epochs to converge": [
+            3584000, 3584000, 3584000, 3584000, 4096000,
+            4096000, 4096000, 4096000, 4096000, 4096000,
+            4096000, 4608000, 4608000]
+    }
+
+}
diff --git a/mlperf_logging/repo_checker/README.md b/mlperf_logging/repo_checker/README.md
index a3a703b..c4f0fe5 100644
--- a/mlperf_logging/repo_checker/README.md
+++ b/mlperf_logging/repo_checker/README.md
@@ -12,12 +12,12 @@ review process.
 python3 -m mlperf_logging.repo_checker FOLDER USAGE RULESET
 ```
 
-Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0 and 3.1.0 are supported.
+Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0 are supported.
 
 The repo checker checks:
 1. Whether the repo contains filenames that github does not like, e.g. files with spaces,
    files that start with '.' or '/.'
-2. Files that violate the github file limit (50MB)
+2. Files that violate the github file limit (50MB)
 
 ## Tested software versions
 Tested and confirmed working using the following software versions:
diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml
index c801612..d6fc7a0 100644
--- a/mlperf_logging/result_summarizer/config.yaml
+++ b/mlperf_logging/result_summarizer/config.yaml
@@ -65,6 +65,8 @@ columns:
       bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"]
       gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"]
       dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"]
+      maskrcnn: ["Benchmark results (minutes)", "Object detection, heavy-weight", "COCO", "Mask R-CNN"]
+      minigo: ["Benchmark results (minutes)", "Reinforcement Learning", "Go", "Minigo"]
       resnet: ["Benchmark results (minutes)", "Image classification", "ImageNet", "ResNet"]
       ssd: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"]
       rnnt: ["Benchmark results (minutes)", "Speech recognition", "LibriSpeech", "RNN-T"]
@@ -72,6 +74,7 @@
       stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"]
       gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"]
     default: [" ", " ", " "]
+
 hpc:
   "2.0.0":
     bert: ["Benchmark results (minutes)", "CosmoFlow"]
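For context, a conceptual sketch of how an RCP record such as `sd_ref_512` above feeds the comparison. This is not the real algorithm: `rcp_checker` applies statistical tolerances and can interpolate between reference batch sizes; the sketch only shows the direction of the test, and the submission numbers are hypothetical:

```python
from statistics import mean

# One RCP record, copied from rcps_stable_diffusion.json above. For
# stable_diffusion these "epochs" are samples_count values at convergence.
rcp = {
    "BS": 512,
    "Epochs to converge": [
        2560000, 2560000, 2560000, 2560000, 2560000,
        2560000, 2560000, 2560000, 2560000, 2560000,
        2560000, 2560000, 2560000, 3072000],
}

reference_mean = mean(rcp["Epochs to converge"])

# Hypothetical submission runs at the same batch size.
submission_samples_to_converge = [2560000, 2560000, 3072000]
submission_mean = mean(submission_samples_to_converge)

# A submission that converges in fewer samples than the reference, beyond
# the allowed statistical tolerance, fails the RCP test.
print(submission_mean >= reference_mean)
```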