From ab9a50367f3622b562030ea4e6b1e9e4a20aff2c Mon Sep 17 00:00:00 2001 From: JiadeXin2021 Date: Fri, 19 Nov 2021 16:17:09 +0800 Subject: [PATCH] fix a bug of main.py (#518) * fix a bug of main.py * fix bugs of non-distributed training (cherry picked from commit de69c2dc0b0b7424c0a5b51f6d1aa4db7dcaa73f) --- .../imagenet/cpu/prune/main.py | 16 ++++++++--- .../imagenet/cpu/prune/requirements.txt | 1 + .../cpu/prune/run_pruning_distributed_cpu.sh | 27 +++++++++---------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/main.py b/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/main.py index 2fd76d9612e..199a67bd302 100644 --- a/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/main.py +++ b/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/main.py @@ -70,7 +70,7 @@ help='number per node for distributed training') parser.add_argument('--seed', default=None, type=int, help='seed for initializing training. ') -parser.add_argument('--keep-batch-size', dest='keep-batch-size',i +parser.add_argument('--keep-batch-size', dest='keep_batch_size', action='store_true', help='keep the batch size rather than scale lr') @@ -98,6 +98,7 @@ def main_worker(args): if args.distributed: hvd.init() + print(hvd.size(), args.world_size, args.num_per_node) assert(hvd.size() == args.world_size * args.num_per_node) # create model @@ -278,7 +279,11 @@ def train(train_loader, model, criterion, optimizer, epoch, args, op): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and hvd.rank() == 0: + + if (i % args.print_freq == 0 + and (not args.distributed + or (args.distributed and hvd.rank() == 0 + ))): progress.print(i) if args.iteration > 0 and i > args.iteration: @@ -315,10 +320,13 @@ def validate(val_loader, model, criterion, args): top1.update(acc1[0], input.size(0)) top5.update(acc5[0], input.size(0)) - if i % args.print_freq == 0 and hvd.rank() == 0: + if (i % args.print_freq == 0 + and (not args.distributed + or (args.distributed and hvd.rank() == 0 + ))): progress.print(i) - if hvd.rank() == 0: + if not args.distributed or (args.distributed and hvd.rank() == 0): # TODO: this should also be done with the ProgressMeter print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}' .format(top1=(top1.avg / 100), top5=(top5.avg / 100))) diff --git a/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/requirements.txt b/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/requirements.txt index ac988bdf841..481619f24e8 100644 --- a/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/requirements.txt +++ b/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/requirements.txt @@ -1,2 +1,3 @@ torch torchvision +horovod diff --git a/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/run_pruning_distributed_cpu.sh b/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/run_pruning_distributed_cpu.sh index 8059d91e3e6..2d07c05af32 100644 --- a/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/run_pruning_distributed_cpu.sh +++ b/examples/pytorch/eager/image_recognition/imagenet/cpu/prune/run_pruning_distributed_cpu.sh @@ -1,14 +1,13 @@ -horovodrun -np 2 - python -u main.py \ - /path/to/imagenet/ \ - --topology resnet18 \ - --prune \ - --config conf.yaml \ - --pretrained \ - --output-model model_final.pth \ - --world-size 1 \ - --num-per-node 2 \ - --batch-size 256 \ - --keep-batch-size \ - --lr 0.001 \ - --iteration 30 \ +horovodrun -np 2 python -u main.py \ + /path/to/imagenet/ \ + --topology resnet18 \ + --prune \ + --config conf.yaml \ + --pretrained \ + --output-model model_final.pth \ + --world-size 1 \ + --num-per-node 2 \ + --batch-size 256 \ + --keep-batch-size \ + --lr 0.001 \ + --iteration 30 \