From aff3f204e6e62ebf71d5fc4b2a937ce654b5b0df Mon Sep 17 00:00:00 2001 From: Brian Ko Date: Tue, 1 Dec 2020 04:20:53 +0000 Subject: [PATCH 1/3] Transcribed imagenet code --- examples/imagenet.py | 363 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 363 insertions(+) create mode 100644 examples/imagenet.py diff --git a/examples/imagenet.py b/examples/imagenet.py new file mode 100644 index 0000000..3aac07e --- /dev/null +++ b/examples/imagenet.py @@ -0,0 +1,363 @@ +import os +import warnings + +import hydra +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torchvision.models as models +import torchvision.datasets as datasets +import torchvision.transforms as transforms +from torch.optim import Adam + +best_acc1 = 0 + + +@hydra.main(config_name="imagenetconf") +def main(cfg): + if cfg.seed is not None: + random.seed(cfg.seed) + torch.manual_seed(cfg.seed) + cudnn.deterministic = True + warnings.warn( + "You have chosen to seed training. " + "This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! " + "You may see unexpected behavior when restarting " + "from checkpoints." + ) + if cfg.gpu is not None: + warnings.warn( + "You have chosen a specific GPU. This will completely " + "disable data parallelism." + ) + if cfg.dist_url == "env://" and cfg.world_size == -1: + cfg.world_size = int(os.environ["WORLD_SIZE"]) + + global best_acc1 + + if cfg.gpu is not None: + print("Use GPU: {} for training".format(cfg.gpu)) + + if cfg.distributed: + if cfg.dist_url == "env://" and cfg.rank == -1: + cfg.rank = int(os.environ["RANK"]) + if cfg.multiprocessing_distributed: + cfg.rank = cfg.rank * cfg.ngpus_per_node + gpu + dist.init_process_group( + backend=cfg.dist_backend, + init_method=cfg.dist_url, + world_size=cfg.world_size, + rank=cfg.rank, + ) + if cfg.pretrained: + print("=> using pre-trained model '{}'".format(cfg.arch)) + model = models.__dict__[cfg.arch](pretrained=True) + else: + print("=> creating model '{}'".format(cfg.arch)) + model = models.__dict__[cfg.arch]() + + if not torch.cuda.is_available(): + print("using CPU, this wil be slow") + elif cfg.distributed: + if cfg.gpu is not None: + torch.cuda.set_device(cfg.gpu) + model.cuda(cfg.gpu) + cfg.batch_size = int(cfg.batch_size / cfg.ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[cfg.gpu] + ) + else: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + elif cfg.gpu is not None: + torch.cuda.set_device(cfg.gpu) + model = model.cuda(cfg.gpu) + else: + if cfg.arch.startswith("alexnet") or cfg.arch.startswith("vgg"): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + criterion = nn.CrossEntropyLoss().cuda(cfg.gpu) + optimizer = Adam( + lr=cfg.adam.lr, + rho=cfg.adam.rho, + eps=cfg.adam.eps, + weight_decay=cfg.adam.weight_decay, + params=model.parameters(), + ) + + if cfg.resume: + if os.path.isfile(cfg.resume): + print("=> loading checkpoint '{}'".format(cfg.resume)) + if cfg.gpu is None: + checkpoint = torch.load(cfg.resume) + else: + loc = "cuda:{}".format(cfg.gpu) + checkpoint = torch.load(cfg.resume, map_location=loc) + cfg.start_epoch = checkpoint["epoch"] + best_acc1 = checkpoint["best_acc1"] + if cfg.gpu is not None: + best_acc1 = best_acc1.to(cfg.gpu) + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print( + "=> 
loaded checkpoint '{}' (epoch {})".format( + cfg.resume, checkpoint["epoch"] + ) + ) + else: + print("=> no checkpoint found at '{}'".format(cfg.resume)) + + cudnn.benchmark = True + + traindir = os.path.join(cfg.data, "train") + valdir = os.path.join(cfg.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + train_dataset = datasets.ImageFolder( + train_dir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + + if cfg.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=cfg.batch_size, + shuffle=(train_sampler is None), + num_workers=cfg.workers, + pin_memory=True, + sampler=train_sampler, + ) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ), + batch_size=args.batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + + if cfg.evaluate: + validate(val_loader, model, criterion, cfg) + return + + for epoch in range(cfg.start_epoch, cfg.epochs): + if cfg.distributed: + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, epoch, cfg) + + train(train_loader, model, criterion, optimizer, epoch, args) + acc1 = validate(val_loader, model, criterion, cfg) + + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not cfg.multiprocessing_distributed or ( + cfg.multiprocessing_distributed and cfg.rank % cfg.ngpus_per_node == 0 + ): + save_checkpoint( + { + "epoch": epoch + 1, + "arch": cfg.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + ) + + +def train(train_loader, model, criterion, optimizer, epoch, cfg): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch), + ) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(cfg.gpu, non_blocking=True) + if torch.cuda.is_available(): + target = target.cuda(cfg.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def validate(val_loader, model, criterion, cfg): + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(val_loader), [batch_time, losses, top1, top5], 
prefix="Test: " + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + if cfg.gpu is not None: + images = images.cuda(cfg.gpu, non_blocking=True) + if torch.cuda.is_available(): + target = target.cuda(cfg.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + print( + " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) + ) + + return top1.avg + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + +def adjust_learning_rate(optimizer, epoch, cfg): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + lr = cfg.lr * (0.1 ** (epoch // 30)) + for param_group in optimizer.param_groups: + param_group["lr"] = lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() From 54040e83d153d0023ae66464e1fd27301da1d377 Mon Sep 17 00:00:00 2001 From: Brian Ko Date: Thu, 3 Dec 2020 08:16:54 +0000 Subject: [PATCH 2/3] Pruned most non-DDP code --- examples/{imagenet.py => imagenet_00.py} | 170 ++++++----------------- examples/imagenetconf.yaml | 19 +++ 2 files changed, 62 insertions(+), 127 deletions(-) rename examples/{imagenet.py => imagenet_00.py} (61%) create mode 100644 examples/imagenetconf.yaml diff --git a/examples/imagenet.py b/examples/imagenet_00.py similarity index 61% rename from examples/imagenet.py rename to examples/imagenet_00.py index 3aac07e..29f0295 100644 --- 
a/examples/imagenet.py +++ b/examples/imagenet_00.py @@ -1,7 +1,12 @@ +import logging import os +import random +import shutil +import time import warnings import hydra +from omegaconf import DictConfig import torch import torch.nn as nn import torch.backends.cudnn as cudnn @@ -9,111 +14,49 @@ import torchvision.models as models import torchvision.datasets as datasets import torchvision.transforms as transforms -from torch.optim import Adam +from torch.optim import SGD -best_acc1 = 0 +logger = logging.getLogger("ImageNet") @hydra.main(config_name="imagenetconf") -def main(cfg): +def main(cfg: DictConfig): if cfg.seed is not None: random.seed(cfg.seed) torch.manual_seed(cfg.seed) + cudnn.benchmark = False cudnn.deterministic = True - warnings.warn( - "You have chosen to seed training. " - "This will turn on the CUDNN deterministic setting, " - "which can slow down your training considerably! " - "You may see unexpected behavior when restarting " - "from checkpoints." - ) - if cfg.gpu is not None: - warnings.warn( - "You have chosen a specific GPU. This will completely " - "disable data parallelism." - ) - if cfg.dist_url == "env://" and cfg.world_size == -1: - cfg.world_size = int(os.environ["WORLD_SIZE"]) - - global best_acc1 - if cfg.gpu is not None: - print("Use GPU: {} for training".format(cfg.gpu)) - - if cfg.distributed: - if cfg.dist_url == "env://" and cfg.rank == -1: - cfg.rank = int(os.environ["RANK"]) - if cfg.multiprocessing_distributed: - cfg.rank = cfg.rank * cfg.ngpus_per_node + gpu - dist.init_process_group( - backend=cfg.dist_backend, - init_method=cfg.dist_url, - world_size=cfg.world_size, - rank=cfg.rank, - ) - if cfg.pretrained: - print("=> using pre-trained model '{}'".format(cfg.arch)) - model = models.__dict__[cfg.arch](pretrained=True) - else: - print("=> creating model '{}'".format(cfg.arch)) - model = models.__dict__[cfg.arch]() + logger.info(f"Use GPU: {cfg.gpu} for training") + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12345" + dist.init_process_group( + backend=cfg.dist_backend, + # init_method=cfg.dist_url, + world_size=cfg.world_size, + rank=cfg.gpu, + ) + return - if not torch.cuda.is_available(): - print("using CPU, this wil be slow") - elif cfg.distributed: - if cfg.gpu is not None: - torch.cuda.set_device(cfg.gpu) - model.cuda(cfg.gpu) - cfg.batch_size = int(cfg.batch_size / cfg.ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[cfg.gpu] - ) - else: - model.cuda() - model = torch.nn.parallel.DistributedDataParallel(model) - elif cfg.gpu is not None: - torch.cuda.set_device(cfg.gpu) - model = model.cuda(cfg.gpu) - else: - if cfg.arch.startswith("alexnet") or cfg.arch.startswith("vgg"): - model.features = torch.nn.DataParallel(model.features) - model.cuda() - else: - model = torch.nn.DataParallel(model).cuda() + model = models.__dict__[cfg.arch]() + torch.cuda.set_device(cfg.gpu) + model.cuda(cfg.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + cfg.batch_size = int(cfg.batch_size / cfg.world_size) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[cfg.gpu]) criterion = nn.CrossEntropyLoss().cuda(cfg.gpu) - optimizer = Adam( - lr=cfg.adam.lr, - rho=cfg.adam.rho, - eps=cfg.adam.eps, - weight_decay=cfg.adam.weight_decay, + optimizer = SGD( params=model.parameters(), + lr=cfg.sgd.lr, + lambd=cfg.sgd.lambd, + alpha=cfg.sgd.alpha, + t0=cfg.sgd.t0, + 
weight_decay=cfg.sgd.weight_decay, ) - if cfg.resume: - if os.path.isfile(cfg.resume): - print("=> loading checkpoint '{}'".format(cfg.resume)) - if cfg.gpu is None: - checkpoint = torch.load(cfg.resume) - else: - loc = "cuda:{}".format(cfg.gpu) - checkpoint = torch.load(cfg.resume, map_location=loc) - cfg.start_epoch = checkpoint["epoch"] - best_acc1 = checkpoint["best_acc1"] - if cfg.gpu is not None: - best_acc1 = best_acc1.to(cfg.gpu) - model.load_state_dict(checkpoint["state_dict"]) - optimizer.load_state_dict(checkpoint["optimizer"]) - print( - "=> loaded checkpoint '{}' (epoch {})".format( - cfg.resume, checkpoint["epoch"] - ) - ) - else: - print("=> no checkpoint found at '{}'".format(cfg.resume)) - - cudnn.benchmark = True - traindir = os.path.join(cfg.data, "train") valdir = os.path.join(cfg.data, "val") normalize = transforms.Normalize( @@ -121,7 +64,7 @@ def main(cfg): ) train_dataset = datasets.ImageFolder( - train_dir, + traindir, transforms.Compose( [ transforms.RandomResizedCrop(224), @@ -158,9 +101,9 @@ def main(cfg): ] ), ), - batch_size=args.batch_size, + batch_size=cfg.batch_size, shuffle=False, - num_workers=args.workers, + num_workers=cfg.workers, pin_memory=True, ) @@ -173,25 +116,8 @@ def main(cfg): train_sampler.set_epoch(epoch) adjust_learning_rate(optimizer, epoch, cfg) - train(train_loader, model, criterion, optimizer, epoch, args) - acc1 = validate(val_loader, model, criterion, cfg) - - is_best = acc1 > best_acc1 - best_acc1 = max(acc1, best_acc1) - - if not cfg.multiprocessing_distributed or ( - cfg.multiprocessing_distributed and cfg.rank % cfg.ngpus_per_node == 0 - ): - save_checkpoint( - { - "epoch": epoch + 1, - "arch": cfg.arch, - "state_dict": model.state_dict(), - "best_acc1": best_acc1, - "optimizer": optimizer.state_dict(), - }, - is_best, - ) + train(train_loader, model, criterion, optimizer, epoch, cfg) + validate(val_loader, model, criterion, cfg) def train(train_loader, model, criterion, optimizer, epoch, cfg): @@ -214,7 +140,7 @@ def train(train_loader, model, criterion, optimizer, epoch, cfg): # measure data loading time data_time.update(time.time() - end) - if args.gpu is not None: + if cfg.gpu is not None: images = images.cuda(cfg.gpu, non_blocking=True) if torch.cuda.is_available(): target = target.cuda(cfg.gpu, non_blocking=True) @@ -238,7 +164,7 @@ def train(train_loader, model, criterion, optimizer, epoch, cfg): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0: + if i % cfg.print_freq == 0: progress.display(i) @@ -276,22 +202,12 @@ def validate(val_loader, model, criterion, cfg): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0: + if i % cfg.print_freq == 0: progress.display(i) - - # TODO: this should also be done with the ProgressMeter - print( + logger.info( " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) ) - return top1.avg - - -def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): - torch.save(state, filename) - if is_best: - shutil.copyfile(filename, "model_best.pth.tar") - class AverageMeter(object): """Computes and stores the average and current value""" @@ -327,7 +243,7 @@ def __init__(self, num_batches, meters, prefix=""): def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] - print("\t".join(entries)) + logger.info("\t".join(entries)) def _get_batch_fmtstr(self, num_batches): num_digits = len(str(num_batches // 1)) diff --git 
a/examples/imagenetconf.yaml b/examples/imagenetconf.yaml new file mode 100644 index 0000000..f2ba313 --- /dev/null +++ b/examples/imagenetconf.yaml @@ -0,0 +1,19 @@ +seed: ~ +gpu: ~ +world_size: 4 +dist_backend: nccl +dist_url: "localhost:9999" +batch_size: 256 +ngpus_per_node: 4 +sgd: + lr: 0.01 + lambd: 0.0001 + alpha: 0.75 + t0: 1000000.0 + weight_decay: 0 + +hydra.launcher.joblib.backend: multiprocessing + + +defaults: + - hydra/launcher: joblib From f5dff54fb2257ca89bc5ee60294b0628ac118ff5 Mon Sep 17 00:00:00 2001 From: Brian Ko Date: Thu, 3 Dec 2020 09:47:31 +0000 Subject: [PATCH 3/3] Fixing conf --- examples/imagenetconf.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/imagenetconf.yaml b/examples/imagenetconf.yaml index f2ba313..5fab571 100644 --- a/examples/imagenetconf.yaml +++ b/examples/imagenetconf.yaml @@ -12,8 +12,6 @@ sgd: t0: 1000000.0 weight_decay: 0 -hydra.launcher.joblib.backend: multiprocessing - defaults: - hydra/launcher: joblib
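
Taken together, the three patches converge on a launch model in which Hydra's joblib launcher starts one worker process per GPU and each worker uses its gpu override as its distributed rank. The sketch below isolates that initialization pattern. It is a minimal reconstruction rather than part of the patches: it assumes a single node with four GPUs, reuses the MASTER_ADDR/MASTER_PORT values hard-coded in patch 2 and the gpu, world_size, and dist_backend keys from imagenetconf.yaml, and omits the model, data pipeline, and training loop.

import logging
import os

import hydra
import torch
import torch.distributed as dist
from omegaconf import DictConfig

logger = logging.getLogger("ImageNet")


@hydra.main(config_name="imagenetconf")
def main(cfg: DictConfig):
    # One Hydra job per GPU: the per-job gpu override doubles as the rank.
    os.environ["MASTER_ADDR"] = "localhost"  # hard-coded in patch 2
    os.environ["MASTER_PORT"] = "12345"      # hard-coded in patch 2
    dist.init_process_group(
        backend=cfg.dist_backend,   # "nccl" in imagenetconf.yaml
        world_size=cfg.world_size,  # 4 in imagenetconf.yaml
        rank=cfg.gpu,
    )
    torch.cuda.set_device(cfg.gpu)
    logger.info("initialized rank %d of %d", dist.get_rank(), dist.get_world_size())


if __name__ == "__main__":
    main()

Because "hydra/launcher: joblib" sits in the defaults list, one plausible invocation is a multirun that sweeps the gpu override across the local devices, for example "python imagenet_00.py --multirun gpu=0,1,2,3". That command is an assumption about intended usage rather than something the patches document, and it further assumes that data and the other keys read by the script are supplied as overrides or added to the config.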
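
One detail worth flagging for readers of patch 2: the optimizer is imported as SGD from torch.optim, but the keyword set taken from cfg.sgd (lr, lambd, alpha, t0, weight_decay) matches the signature of torch.optim.ASGD, not torch.optim.SGD, which only accepts lr, momentum, dampening, weight_decay, and nesterov. The sketch below shows the construction those config keys line up with; treating ASGD as the intended optimizer is an assumption on the editor's part, since the patch does not say which one is meant.

from torch.optim import ASGD
from torchvision import models

# A stand-in model for illustration; patch 2 builds the architecture from cfg.arch instead.
model = models.resnet18()

# These values mirror the sgd: block added to imagenetconf.yaml in patch 2
# and coincide with ASGD's documented defaults.
optimizer = ASGD(
    model.parameters(),
    lr=0.01,
    lambd=0.0001,
    alpha=0.75,
    t0=1000000.0,
    weight_decay=0,
)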
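
For readers unfamiliar with the accuracy helper that both versions of the script use in train() and validate(), here is a small self-contained walk-through of the same top-k logic on a toy batch. The tensor values are made up for illustration; only the indexing pattern is taken from the patches.

import torch

# Toy logits for a batch of 2 samples over 4 classes (made-up numbers).
output = torch.tensor([[0.1, 0.6, 0.2, 0.1],
                       [0.3, 0.1, 0.4, 0.2]])
target = torch.tensor([1, 0])

maxk = 2  # equivalent to topk=(1, 2) for this toy case
_, pred = output.topk(maxk, 1, True, True)  # top-2 class indices per sample: [[1, 2], [2, 0]]
pred = pred.t()                             # shape (maxk, batch): row k holds the k-th guesses
correct = pred.eq(target.view(1, -1).expand_as(pred))

# Sample 0's top-1 guess (class 1) matches its target, sample 1's (class 2) does not -> 50.0
top1 = correct[:1].reshape(-1).float().sum(0) * 100.0 / target.size(0)
# Sample 1's target (class 0) appears among its top-2 guesses -> 100.0
top2 = correct[:2].reshape(-1).float().sum(0) * 100.0 / target.size(0)
print(top1.item(), top2.item())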