From 3f7d22d2b01b2720acc3aeed8493394f708011c6 Mon Sep 17 00:00:00 2001
From: ancestor-mithril
Date: Mon, 29 Apr 2024 16:51:01 +0300
Subject: [PATCH] Added experiment runner + support for StepBS and StepLR + seed everything

---
 experiment_runner.py | 133 +++++++++++++++++++++++++++++++++++++++++++
 main.py              |   2 +-
 utils/scheduler.py   |  12 +++-
 utils/trainer.py     |   2 +
 utils/utils.py       |  15 +++++
 5 files changed, 161 insertions(+), 3 deletions(-)
 create mode 100644 experiment_runner.py
 create mode 100644 utils/utils.py

diff --git a/experiment_runner.py b/experiment_runner.py
new file mode 100644
index 0000000..609146b
--- /dev/null
+++ b/experiment_runner.py
@@ -0,0 +1,133 @@
+import itertools
+import os
+import subprocess
+import sys
+import time
+from concurrent.futures import ProcessPoolExecutor
+from datetime import date
+from multiprocessing import freeze_support, current_process
+
+import torch.cuda
+
+processes_per_gpu = 1
+gpu_count = 1
+max_batch_size = 1000
+
+run_index = 0
+last_index = -1
+if len(sys.argv) >= 2:
+    run_index = int(sys.argv[1])
+if len(sys.argv) >= 3:
+    last_index = int(sys.argv[2])
+
+
+def run_command(command_idx):
+    command, idx = command_idx
+    gpu_index = current_process()._identity[0] % gpu_count
+    if torch.cuda.is_available():
+        command += f' -device cuda:{gpu_index}'
+        print("Command:", idx, "on gpu", gpu_index, "on process", current_process()._identity[0])
+    else:
+        command += ' -device cpu'
+        print("Command:", idx, "on cpu on process", current_process()._identity[0])
+
+    today = date.today()
+    os.makedirs('./logs', exist_ok=True)
+    try:
+        start = time.time()
+        with open(f"./logs/error_{idx}_{today}.txt", 'a+') as err:
+            subprocess.run(command, shell=True, check=True, stderr=err)
+        os.remove(f"./logs/error_{idx}_{today}.txt")
+        elapsed = (time.time() - start)
+        with open("./logs/finished_runs.txt", "a+") as fp:
+            fp.write(f"{idx} -> {today} -> " + str(elapsed) + "s + " + command + "\n")
+    except subprocess.CalledProcessError:
+        with open(f"./logs/failed_runs_{today}.txt", "a+") as fp:
+            fp.write(command + '\n')
+
+
+def create_run(dataset, model, optimizer, seed, epochs, es_patience, batch_size, scheduler_params):
+    scheduler_name, scheduler_params = scheduler_params
+    scheduler_params = str(scheduler_params).replace(" ", "")
+    scheduler_params = str(scheduler_params).replace('"', '\'')
+    scheduler_params = '"' + scheduler_params + '"'
+    return (
+        f" -lr 0.001"
+        f" -bs {batch_size}"
+        f" -epochs {epochs}"
+        f" -dataset {dataset}"
+        f" -data_path ../data"
+        f" -scheduler {scheduler_name}"
+        f" -scheduler_params {scheduler_params}"
+        f" -model {model}"
+        f" -seed {seed}"
+        f" -fill 0.5"
+        f" --cutout"
+        f" --autoaug"
+        f" --tta"
+    ) + (" --half" if torch.cuda.is_available() else "")
+
+
+def generate_runs():
+    datasets = [
+        'cifar10', 'cifar100'
+    ]
+    models = [
+        'preresnet18_c10'
+    ]
+    optimizers = [
+        'sgd'
+    ]
+    seeds = [
+        2525
+    ]
+    epochss = [
+        10
+    ]
+    es_patiences = [
+        20
+    ]
+    batch_sizes = [
+        10, 16, 32
+    ]
+    schedulers = [
+        ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 2.0, 'max_batch_size': max_batch_size}),
+        ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 5.0, 'max_batch_size': max_batch_size}),
+        ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.5}),
+        ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.2}),
+
+        ('StepBS', {'step_size': 30, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
+        ('StepBS', {'step_size': 50, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
+        ('StepBS', {'step_size': 30, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
+        ('StepBS', {'step_size': 50, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
+
+        ('StepLR', {'step_size': 30, 'gamma': 2.0}),
+        ('StepLR', {'step_size': 50, 'gamma': 2.0}),
+        ('StepLR', {'step_size': 30, 'gamma': 5.0}),
+        ('StepLR', {'step_size': 50, 'gamma': 5.0}),
+    ]
+
+    runs = []
+    for dataset, model, optimizer, seed, epochs, es_patience, batch_size, scheduler_params in \
+            itertools.product(datasets, models, optimizers, seeds, epochss, es_patiences, batch_sizes, schedulers):
+        run = create_run(dataset=dataset, model=model, optimizer=optimizer, seed=seed, epochs=epochs,
+                         es_patience=es_patience, batch_size=batch_size, scheduler_params=scheduler_params)
+        runs.append(run)
+
+    return [f"python main.py {i}" for i in runs]
+
+
+if __name__ == "__main__":
+    freeze_support()
+    runs = generate_runs()
+
+    # # Debug
+    # for i in runs:
+    #     print(i)
+
+    print(len(runs))
+    if last_index == -1 or last_index > len(runs):
+        last_index = len(runs)
+
+    with ProcessPoolExecutor(max_workers=gpu_count * processes_per_gpu) as executor:
+        executor.map(run_command, [(runs[index], index) for index in range(run_index, last_index)])
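Note on the runner: the grid built in generate_runs() is the Cartesian product of the option lists above, i.e. 2 datasets x 3 batch sizes x 12 scheduler configurations (the other lists are singletons), or 72 commands in total, and the two optional CLI arguments select a half-open slice of that list. A hypothetical illustration, not part of the patch:

    # Illustration only: how experiment_runner.py slices its run list.
    from experiment_runner import generate_runs

    runs = generate_runs()
    assert len(runs) == 72      # 2 datasets * 3 batch sizes * 12 scheduler configs
    first_half = runs[0:36]     # what `python experiment_runner.py 0 36` would execute
    second_half = runs[36:]     # what `python experiment_runner.py 36` would execute

Each selected command is then dispatched by the process pool, with every worker pinned to a GPU index derived from its pool identity modulo gpu_count.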
diff --git a/main.py b/main.py
index 872f3ad..34b835c 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@
     parser.add_argument('-model', default='preresnet18_c10', type=str, help='model')
     parser.add_argument('-fill', default=None, type=float, help='fill value for transformations')
     parser.add_argument('-num_threads', default=None, type=int, help='default number of threads used by pytorch')
+    parser.add_argument('-seed', default=3, type=int, help='seed')
     parser.add_argument('--cutout', action='store_true', default=False, help='apply cutout')
     parser.add_argument('--autoaug', action='store_true', default=False, help='apply autoaugment')
     parser.add_argument('--tta', action='store_true', default=False, help='use TTA')
@@ -38,6 +39,5 @@
     print(args)
     Trainer(args).run()
 
-# TODO: Factor could be int
 # PYTHONOPTIMIZE=2 python main.py -device cuda:0 -lr 0.001 -bs 10 -epochs 200 -dataset cifar10 -data_path ../data -scheduler ReduceLROnPlateau -scheduler_params "{'mode':'min', 'factor':0.5}" -model preresnet18_c10 -fill 0.5 --cutout --autoaug --tta --half # noqa: E501
 # PYTHONOPTIMIZE=2 python main.py -device cuda:0 -lr 0.001 -bs 10 -epochs 200 -dataset cifar10 -data_path ../data -scheduler IncreaseBSOnPlateau -scheduler_params "{'mode':'min', 'factor':2.0, 'max_batch_size': 1000}" -model preresnet18_c10 -fill 0.5 --cutout --autoaug --tta --half # noqa: E501
diff --git a/utils/scheduler.py b/utils/scheduler.py
index daedca1..b493bfa 100644
--- a/utils/scheduler.py
+++ b/utils/scheduler.py
@@ -1,14 +1,22 @@
-from bs_scheduler import IncreaseBSOnPlateau
+from bs_scheduler import IncreaseBSOnPlateau, StepBS
 from torch.optim import Optimizer
-from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
 from torch.utils.data import DataLoader
 
 
 def init_scheduler(args, optimizer: Optimizer, train_loader: DataLoader):
     if args.scheduler == 'IncreaseBSOnPlateau':
+        # "{'mode':'min', 'factor':2.0, 'max_batch_size': 1000}"
         scheduler = IncreaseBSOnPlateau(train_loader, **args.scheduler_params)
     elif args.scheduler == 'ReduceLROnPlateau':
+        # "{'mode':'min', 'factor':0.5}"
         scheduler = ReduceLROnPlateau(optimizer, **args.scheduler_params)
+    elif args.scheduler == 'StepBS':
+        # "{'step_size':30, 'gamma': 2.0, 'max_batch_size': 1000}"
+        scheduler = StepBS(train_loader, **args.scheduler_params)
+    elif args.scheduler == 'StepLR':
+        # "{'step_size':30, 'gamma': 2.0}"
+        scheduler = StepLR(optimizer, **args.scheduler_params)
     else:
         raise NotImplementedError(f'Scheduler {args.scheduler} not implemented')
     return scheduler
diff --git a/utils/trainer.py b/utils/trainer.py
index 8a86ffa..b1d1db0 100644
--- a/utils/trainer.py
+++ b/utils/trainer.py
@@ -16,11 +16,13 @@
 from utils.optimizer import init_optimizer
 from utils.scheduler import init_scheduler
 from utils.early_stopping import init_early_stopping
+from utils.utils import seed_everything
 
 
 class Trainer:
     def __init__(self, args):
         self.args = args
+        seed_everything(args.seed)
         self.device = torch.device(args.device)
         print(f'Using {self.device}')
diff --git a/utils/utils.py b/utils/utils.py
new file mode 100644
index 0000000..0e32fb7
--- /dev/null
+++ b/utils/utils.py
@@ -0,0 +1,15 @@
+import random
+
+import numpy as np
+import torch
+
+
+def seed_everything(seed):
+    if seed is None:
+        return
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    if torch.cuda.is_available():
+        torch.backends.cudnn.deterministic = True
+        torch.cuda.manual_seed_all(seed)