Commit c9f3aae

Merge branch 'master' into gnn_logging_update

drcanchi authored Mar 28, 2024
2 parents 596cb01 + 99ba37a
Showing 15 changed files with 200 additions and 24 deletions.
2 changes: 1 addition & 1 deletion mlperf_logging/compliance_checker/mlp_parser/__init__.py
@@ -29,4 +29,4 @@ def parse_file(filename, ruleset='0.6.0'):
     elif ruleset == '4.0.0':
         return parse_file_400(filename)
     else:
-        raise Exception(f'Ruleset "{ruleset}" is not supported')
+        raise Exception(f'Ruleset "{ruleset}" is not supported')
@@ -8,4 +8,4 @@
 - KEY:
     NAME: gradient_accumulation_steps
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] > 0 "
+    CHECK: " v['value'] > 0 "
@@ -0,0 +1,74 @@
# Stable diffusion uses two metrics, FID and CLIP.
# These metrics can be calculated offline, using different scripts,
# and logged separately. Therefore, we create a virtual key
# called aggregated_eval_accuracy, which aggregates
# both metrics into a single log line.

- BEGIN:
    CODE: |
        from dataclasses import replace
        agg_eval_lines = {}
        for line in loglines:
            if line.key == "eval_accuracy" and 'metric' in line.value['metadata']:
                samples_count = line.value['metadata']['samples_count']
                if samples_count not in agg_eval_lines:
                    new_line = replace(line)  # Make a copy
                    new_line.key = "aggregated_eval_accuracy"
                    new_line.full_string = ""  # Not needed
                    new_line.lineno = -1  # Not needed
                    new_line.value = {'value': {'samples_count': samples_count}, 'metadata': {}}
                    agg_eval_lines[samples_count] = new_line
                agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp)
                agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value']
        loglines.extend(agg_eval_lines.values())
- KEY:
    NAME: global_batch_size
    REQ: AT_LEAST_ONE
    CHECK: " v['value'] >= 0 "

- KEY:
    NAME: opt_name
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 'adamw' "

- KEY:
    NAME: opt_adamw_beta_1
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0.9 "

- KEY:
    NAME: opt_adamw_beta_2
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0.999 "

- KEY:
    NAME: opt_adamw_epsilon
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 1e-08 "

- KEY:
    NAME: opt_adamw_weight_decay
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0.01 "

- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE
    CHECK: " v['value'] >= 0.0 "

- KEY:
    NAME: opt_learning_rate_warmup_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] >= 0 "

- KEY:
    NAME: aggregated_eval_accuracy
    REQ: AT_LEAST(2)
    CHECK:
        - "'FID' in v['value']"
        - "'CLIP' in v['value']"
        - "'samples_count' in v['value']"
    ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)"
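
Editor's illustration (hypothetical values, abbreviated fields; `:::MLLOG` is the standard log-line prefix): the two offline-computed metrics for one checkpoint arrive as separate `eval_accuracy` lines such as

```
:::MLLOG {"key": "eval_accuracy", "value": 21.7, "metadata": {"samples_count": 512000, "metric": "FID"}}
:::MLLOG {"key": "eval_accuracy", "value": 0.31, "metadata": {"samples_count": 512000, "metric": "CLIP"}}
```

The BEGIN hook above folds them into one virtual `aggregated_eval_accuracy` logline whose value is `{'samples_count': 512000, 'FID': 21.7, 'CLIP': 0.31}`, which is the shape the CHECK and ATLEAST_ONE_CHECK rules inspect.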
12 changes: 6 additions & 6 deletions mlperf_logging/compliance_checker/training_4.0.0/common.yaml
@@ -97,16 +97,16 @@
     REQ: AT_LEAST_ONE_OR(epoch_start)
     CHECK:
         - "s['run_started']"
-        - "('epoch_count' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])"
         - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True"
         - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True"
-        - "v['metadata']['step_num'] >= 0 if 'step_num' in v['metadata'] else True"
+        - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True"

 - KEY:
     NAME: block_stop
     REQ: AT_LEAST_ONE_OR(epoch_stop)
     CHECK:
-        - "('first_epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

 - KEY:
     NAME: epoch_start

@@ -125,19 +125,19 @@
     NAME: eval_start
     REQ: AT_LEAST_ONE_OR(block_start)
     CHECK:
-        - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

 - KEY:
     NAME: eval_stop
     REQ: AT_LEAST_ONE_OR(block_stop)
     CHECK:
-        - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

 - KEY:
     NAME: eval_accuracy
     REQ: AT_LEAST_ONE
     CHECK:
-        - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])"
+        - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

 - KEY:
     NAME: train_samples
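
Editor's illustration (hypothetical, abbreviated line): under the updated rules a `block_start` event validates by naming `samples_count` in place of `step_num`, e.g.

```
:::MLLOG {"key": "block_start", "value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 1, "samples_count": 0}}
```

which passes because `samples_count` is non-negative and `first_epoch_num` accompanies a positive `epoch_count`.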
@@ -3,5 +3,4 @@
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
     CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
-    POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
-
+    POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
@@ -0,0 +1,33 @@
# Stable diffusion uses two metrics, FID and CLIP.
# These metrics can be calculated offline, using different scripts,
# and logged separately. Therefore, we create a virtual key
# called aggregated_eval_accuracy, which aggregates
# both metrics into a single log line.

- BEGIN:
    CODE: |
        from dataclasses import replace
        agg_eval_lines = {}
        for line in loglines:
            if line.key == "eval_accuracy" and 'metric' in line.value['metadata']:
                samples_count = line.value['metadata']['samples_count']
                if samples_count not in agg_eval_lines:
                    new_line = replace(line)  # Make a copy
                    new_line.key = "aggregated_eval_accuracy"
                    new_line.full_string = ""  # Not needed
                    new_line.lineno = -1  # Not needed
                    new_line.value = {'value': {'samples_count': samples_count}, 'metadata': {}}
                    agg_eval_lines[samples_count] = new_line
                agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp)
                agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value']
        loglines.extend(agg_eval_lines.values())
- KEY:
    NAME: aggregated_eval_accuracy
    REQ: AT_LEAST(2)
    CHECK:
        - "'FID' in v['value']"
        - "'CLIP' in v['value']"
        - "'samples_count' in v['value']"
    ATLEAST_ONE_CHECK: "v['value']['FID'] >= 0.0 and v['value']['CLIP'] <= 1.0"
1 change: 1 addition & 0 deletions mlperf_logging/mllog/constants.py
@@ -179,6 +179,7 @@
 FIRST_EPOCH_NUM = "first_epoch_num"
 STATUS = "status"
 STEP_NUM = "step_num"
+SAMPLES_COUNT = "samples_count"

# Power constants
POWER_MEASUREMENT_START = "power_measurement_start"
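Editor's sketch (not code from this commit; metric value and counts are made up, and the standard mllog event API is assumed) of how a benchmark implementation might emit the new key:

```
# Hedged sketch: hypothetical values, assuming the mllog event API.
from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()
mllogger.event(
    key=constants.EVAL_ACCURACY,
    value=21.7,  # e.g. an FID score computed offline
    metadata={
        constants.SAMPLES_COUNT: 512000,  # the new 4.0.0 metadata key
        "metric": "FID",                  # consumed by the aggregation hooks above
    },
)
```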
@@ -6,7 +6,7 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--input-log", type=str, default=None)
     parser.add_argument("--hardware-type", type=str, choices=["node", "sw"], default="node")
-    parser.add_argument("--ruleset", type=str, choices=["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0"], default="3.1.0")
+    parser.add_argument("--ruleset", type=str, choices=["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"], default="4.0.0")
args = parser.parse_args()
return args

2 changes: 1 addition & 1 deletion mlperf_logging/package_checker/README.md
@@ -10,7 +10,7 @@ To check an organization's submission package for compliance:
```
python3 -m mlperf_logging.package_checker FOLDER USAGE RULESET
```

-Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0"] are supported.
+Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"] are supported.
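
For example, a v4.0.0 training package (folder name hypothetical) would be checked with:

```
python3 -m mlperf_logging.package_checker ./submission_folder training 4.0.0
```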

The package checker checks:
1. The number of result files for each benchmark matches the required count. If
6 changes: 3 additions & 3 deletions mlperf_logging/package_checker/package_checker.py
@@ -175,13 +175,13 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
         logging.error(" %d files do not comply, directory cannot be accepted", errors_found)

     # Check if each run use unique seeds.
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0'} and division == 'closed':
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'} and division == 'closed':
if not seed_checker.check_seeds(result_files, source_files):
too_many_errors = True
logging.error('Seed checker failed')

     # Run RCP checker for >= 1.0.0
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0'} and division == 'closed' and benchmark != 'minigo':
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'} and division == 'closed' and benchmark != 'minigo':
# Now go again through result files to do RCP checks
rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass)
rcp_pass, rcp_msg, _ = rcp_checker.check_directory(
@@ -235,7 +235,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc
ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc.
"""
too_many_errors = False
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0'}:
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'}:
logging.info(' Checking System Description Files')
system_description_pass = check_systems(folder, usage, ruleset)
too_many_errors = too_many_errors or not system_description_pass
4 changes: 2 additions & 2 deletions mlperf_logging/rcp_checker/README.md
@@ -8,10 +8,10 @@ Run Reference Convergence Point checks for a submission directory.
This consists of testing whether a submission does not converge
statistically faster than the reference.

-For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_3.1.0/*.json
+For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_4.0.0/*.json

The RCP checker supports only the 1.0.0 version onwards.
-The current training version is 3.1.0.
+The current training version is 4.0.0.

## Usage

10 changes: 5 additions & 5 deletions mlperf_logging/rcp_checker/rcp_checker.py
@@ -73,7 +73,7 @@ def read_submission_file(result_file, use_train_samples):

     if benchmark == "stable_diffusion" and ("eval_error" in str or "eval_accuracy" in str):
         eval_accuracy_str = str
-        eval_step = json.loads(eval_accuracy_str)["metadata"]["step_num"]
+        eval_step = json.loads(eval_accuracy_str)["metadata"]["samples_count"]
eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"]
eval_score = json.loads(eval_accuracy_str)["value"]
stable_diffusion_eval_results[eval_step][eval_metric] = eval_score
@@ -84,7 +84,7 @@
elif use_train_samples and "train_samples" in str:
eval_accuracy_str = str
conv_epoch = json.loads(eval_accuracy_str)["value"]

if "run_stop" in str and json.loads(str)["key"] == "run_stop":
conv_result = json.loads(str)["metadata"]["status"]
if conv_result == "success":
@@ -151,8 +151,8 @@ def get_submission_epochs(result_files, bert_train_samples):
class RCP_Checker:

def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None):
-        if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0"}:
-            raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0 and 3.1.0')
+        if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"}:
+            raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0')
self.usage = usage
self.ruleset = ruleset
self.benchmark = benchmark
@@ -519,7 +519,7 @@ def get_parser():
parser.add_argument('--rcp_usage', type=str, default='training',
choices=['training', 'hpc'],
help='what WG does the benchmark come from to check the log against')
-    parser.add_argument('--rcp_version', type=str, default='3.1.0',
+    parser.add_argument('--rcp_version', type=str, default='4.0.0',
help='what version of rules to check the log against')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--bert_train_samples', action='store_true',
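Editor's usage sketch (hedged: only these flag definitions are visible in the hunk, and the positional argument naming the submission directory is assumed from the README above), showing the new default in effect:

```
python3 -m mlperf_logging.rcp_checker --rcp_usage training --rcp_version 4.0.0 --verbose RESULTS_DIR
```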
@@ -0,0 +1,66 @@
{

"sd_ref_512":
{
"Benchmark": "stable_diffusion",
"Creator": "NVIDIA",
"When": "Reference RCPs before v3.1",
"Platform": "32xDGX-A100",
"BS": 512,
"Hyperparams": {
"opt_adamw_beta_1": 0.9,
"opt_adamw_beta_2": 0.999,
"opt_adamw_epsilon": 1e-08,
"opt_adamw_weight_decay": 0.01,
"opt_base_learning_rate": 1.25e-7,
"opt_learning_rate_warmup_steps": 1000
},
"Epochs to converge": [
2560000, 2560000, 2560000, 2560000, 2560000,
2560000, 2560000, 2560000, 2560000, 2560000,
2560000, 2560000, 2560000, 3072000]
},

"sd_ref_1024":
{
"Benchmark": "stable_diffusion",
"Creator": "NVIDIA",
"When": "Reference RCPs before v3.1",
"Platform": "32xDGX-A100",
"BS": 1024,
"Hyperparams": {
"opt_adamw_beta_1": 0.9,
"opt_adamw_beta_2": 0.999,
"opt_adamw_epsilon": 1e-08,
"opt_adamw_weight_decay": 0.01,
"opt_base_learning_rate": 1.25e-7,
"opt_learning_rate_warmup_steps": 1000
},
"Epochs to converge": [
2560000, 2560000, 2560000, 2560000, 2560000,
3072000, 3072000, 3072000, 3072000, 3072000,
3072000, 3072000, 2560000]
},

"sd_ref_2048":
{
"Benchmark": "stable_diffusion",
"Creator": "NVIDIA",
"When": "Reference RCPs before v3.1",
"Platform": "32xDGX-A100",
"BS": 2048,
"Hyperparams": {
"opt_adamw_beta_1": 0.9,
"opt_adamw_beta_2": 0.999,
"opt_adamw_epsilon": 1e-08,
"opt_adamw_weight_decay": 0.01,
"opt_base_learning_rate": 1.25e-7,
"opt_learning_rate_warmup_steps": 1000
},
"Epochs to converge": [
3584000, 3584000, 3584000, 3584000, 4096000,
4096000, 4096000, 4096000, 4096000, 4096000,
4096000, 4608000, 4608000]
}

}
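
Editor's sketch of consuming one such RCP record (assumptions: the file path is hypothetical, and this plain mean/stdev summary stands in for the real statistics in rcp_checker.py, which also prunes entries and interpolates between batch sizes):

```
import json
import statistics

# Hypothetical path; RCP files live under mlperf_logging/rcp_checker/training_4.0.0/.
with open("mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json") as f:
    rcps = json.load(f)

ref = rcps["sd_ref_512"]
samples = ref["Epochs to converge"]  # for stable diffusion these are samples-to-converge counts
print(f"BS={ref['BS']}: mean={statistics.mean(samples):.0f}, stdev={statistics.stdev(samples):.0f}")
```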
4 changes: 2 additions & 2 deletions mlperf_logging/repo_checker/README.md
@@ -12,12 +12,12 @@ review process.
```
python3 -m mlperf_logging.repo_checker FOLDER USAGE RULESET
```

-Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0 and 3.1.0 are supported.
+Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0 are supported.
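
For example (folder name hypothetical), a 4.0.0 training submission repository would be checked with:

```
python3 -m mlperf_logging.repo_checker ./submission_repo training 4.0.0
```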

The repo checker checks:
1. Whether the repo contains filenames that github does not like, e.g. files with spaces,
files that start with '.' or '/.'
-2. Files that violate the github file limit (50MB)
+2. Files that violate the github file limit (50MB)

## Tested software versions
Tested and confirmed working using the following software versions:
3 changes: 3 additions & 0 deletions mlperf_logging/result_summarizer/config.yaml
@@ -65,13 +65,16 @@ columns:
bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"]
gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"]
dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"]
maskrcnn: ["Benchmark results (minutes)", "Object detection, heavy-weight", "COCO", "Mask R-CNN"]
minigo: ["Benchmark results (minutes)", "Reinforcement Learning", "Go", "Minigo"]
resnet: ["Benchmark results (minutes)", "Image classification", "ImageNet", "ResNet"]
ssd: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"]
rnnt: ["Benchmark results (minutes)", "Speech recognition", "LibriSpeech", "RNN-T"]
unet3d: ["Benchmark results (minutes)", "Image segmentation (medical)", "KiTS19", "3D U-Net"]
stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"]
gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"]
default: [" ", " ", " "]

hpc:
"2.0.0":
bert: ["Benchmark results (minutes)", "CosmoFlow"]