Skip to content

Commit

Permalink
Add L3 test with benchmarks
Browse files Browse the repository at this point in the history
Signed-off-by: Krzysztof Lecki <[email protected]>

Extend with accuracy threshold

Signed-off-by: Krzysztof Lecki <[email protected]>
  • Loading branch information
klecki committed Mar 7, 2023
1 parent 4dd2cbe commit a818ae7
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def training_pipe(data_dir, interpolation, image_size, automatic_augmentation, d

@pipeline_def
def validation_pipe(data_dir, interpolation, image_size, image_crop, rank=0, world_size=1):
jpegs, label = fn.readers.file(file_root=data_dir, shard_id=rank, num_shards=world_size,
random_shuffle=False, pad_last_batch=True)
jpegs, label = fn.readers.file(name="Reader", file_root=data_dir, shard_id=rank,
num_shards=world_size, random_shuffle=False, pad_last_batch=True)

images = fn.decoders.image(jpegs, device="mixed", output_type=types.RGB)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ def gdtl(
}

pipe = training_pipe(data_dir=traindir, interpolation=interpolation, image_size=image_size,
dali_device=dali_device, rank=rank, world_size=world_size,
**pipeline_kwargs)
automatic_augmentation=augmentation, dali_device=dali_device,
rank=rank, world_size=world_size, **pipeline_kwargs)

pipe.build()
train_loader = DALIClassificationIterator(
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/use_cases/pytorch/efficientnet/readme.rst
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ To run training benchmarks with different data loaders and automatic augmentatio
export RESULT_WORKSPACE=./
# synthetic benchmark
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 1 --prof 100 --no-checkpoints --training-only --data-backend synthetic --workspace $RESULT_WORKSPACE --raport-file bench_report_synthetic.json $PATH_TO_IMAGENET
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 1 --prof 1000 --no-checkpoints --training-only --data-backend synthetic --workspace $RESULT_WORKSPACE --raport-file bench_report_synthetic.json $PATH_TO_IMAGENET
# DALI without automatic augmentations
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 4 --no-checkpoints --training-only --data-backend dali --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --raport-file bench_report_dali.json $PATH_TO_IMAGENET
Expand Down
128 changes: 128 additions & 0 deletions qa/TL3_EfficientNet_benchmark/test_pytorch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/bin/bash -e

# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o nounset
set -o errexit
set -o pipefail

function CLEAN_AND_EXIT {
exit $1
}

cd /opt/dali/docs/examples/use_cases/pytorch/efficientnet

NUM_GPUS=$(nvidia-smi -L | wc -l)

if [ $NUM_GPUS -ne 8 ];
then
echo "This test requires DGX-1V with 8 GPUs to run correctly"
exit 1
fi

# Setup /imagenet/{train,val}

mkdir -p /imagenet
pushd /imagenet

if [ ! -d "val" ]; then
ln -sf /data/imagenet/val-jpeg/ val
fi
if [ ! -d "train" ]; then
ln -sf /data/imagenet/train-jpeg/ train
fi

popd

export PATH_TO_IMAGENET=/imagenet

export RESULT_WORKSPACE=./

# synthetic benchmark
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 1 --prof 1000 --no-checkpoints --training-only --data-backend synthetic --workspace $RESULT_WORKSPACE --raport-file bench_report_synthetic.json $PATH_TO_IMAGENET

# DALI without automatic augmentations
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 4 --no-checkpoints --training-only --data-backend dali --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --raport-file bench_report_dali.json $PATH_TO_IMAGENET

# DALI with AutoAugment
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 4 --no-checkpoints --training-only --data-backend dali --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --raport-file bench_report_dali_aa.json $PATH_TO_IMAGENET

# DALI with TrivialAugment
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 4 --no-checkpoints --training-only --data-backend dali --automatic-augmentation trivialaugment --workspace $RESULT_WORKSPACE --raport-file bench_report_dali_ta.json $PATH_TO_IMAGENET

# PyTorch without automatic augmentations
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 4 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --raport-file bench_report_pytorch.json $PATH_TO_IMAGENET

# PyTorch with AutoAugment:
python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 4 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --raport-file bench_report_pytorch_aa.json $PATH_TO_IMAGENET


# The line below finds the lines with `train.total_ips`, takes the last one (with the result we
# want) cuts the DLLL (this is highly useful for JSON parsing) from the JSON logs, and parses it
# as JSON using Python. We can now parse the value or straight evaluate the thresholds.
# grep "train.total_ips" <filename>.json | tail -1 | cut -c 5- | python3 -c "import sys, json; print(json.load(sys.stdin))"

# Actual results are about 500 samples/s more
SYNTH_THRESHOLD=10800
DALI_NONE_THRESHOLD=6000 # TODO(klecki): Don't know the result here
DALI_AA_THRESHOLD=9000
DALI_TA_THRESHOLD=9000 # TODO(klecki): Tune to the current number of workers
PYTORCH_NONE_THRESHOLD=7200 # TODO(klecki): Don't know the result here
PYTORCH_AA_THRESHOLD=7200

function CHECK_PERF_THRESHOLD {
FILENAME=$1
THRESHOLD=$2
grep "train.total_ips" $FILENAME | tail -1 | cut -c 5- | python3 -c "import sys, json
total_ips = json.load(sys.stdin)[\"data\"][\"train.total_ips\"]
if total_ips < $THRESHOLD:
print(f\"[FAIL] $FILENAME below threshold: {total_ips} < $THRESHOLD\")
sys.exit(1)
else:
print(f\"[PASS] $FILENAME above threshold: {total_ips} >= $THRESHOLD\")
sys.exit(0)"
}


CHECK_PERF_THRESHOLD "bench_report_synthetic.json" $SYNTH_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_dali.json" $DALI_NONE_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_dali_aa.json" $DALI_AA_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_dali_ta.json" $DALI_TA_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_pytorch.json" $PYTORCH_NONE_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_pytorch_aa.json" $PYTORCH_AA_THRESHOLD


# In the initial training we get siginificant increase in accuracy on the first few epochs,
# after 10 epochs we typically cross 50%.
# Do an additional run of DALI + AA for 10 epochs and check against 48 top1 accuracy (with some
# safety margin).

python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 10 --no-checkpoints --data-backend dali --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --raport-file accuracy_report_dali_aa.json $PATH_TO_IMAGENET


function CHECK_ACCURACY_THRESHOLD {
FILENAME=$1
THRESHOLD=$2
grep "val.top1" $FILENAME | tail -1 | cut -c 5- | python3 -c "import sys, json
accuracy = json.load(sys.stdin)[\"data\"][\"val.top1\"]
if accuracy < $THRESHOLD:
print(f\"[FAIL] $FILENAME below threshold: {accuracy} < $THRESHOLD\")
sys.exit(1)
else:
print(f\"[PASS] $FILENAME above threshold: {accuracy} >= $THRESHOLD\")
sys.exit(0)"
}

CHECK_ACCURACY_THRESHOLD "accuracy_report_dali_aa.json" 48

0 comments on commit a818ae7

Please sign in to comment.