Commit

Added experiment runner + support for StepBS and StepLR + seed everything
ancestor-mithril committed Apr 29, 2024
1 parent 4a8719c commit 3f7d22d
Showing 5 changed files with 161 additions and 3 deletions.
133 changes: 133 additions & 0 deletions experiment_runner.py
@@ -0,0 +1,133 @@
import itertools
import os
import subprocess
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from datetime import date
from multiprocessing import freeze_support, current_process

import torch.cuda

processes_per_gpu = 1
gpu_count = 1
max_batch_size = 1000

run_index = 0
last_index = -1
if len(sys.argv) >= 2:
    run_index = int(sys.argv[1])
if len(sys.argv) >= 3:
    last_index = int(sys.argv[2])


def run_command(command_idx):
    command, idx = command_idx
    gpu_index = current_process()._identity[0] % gpu_count
    if torch.cuda.is_available():
        command += f' -device cuda:{gpu_index}'
        print("Command:", idx, "on gpu", gpu_index, "on process", current_process()._identity[0])
    else:
        command += ' -device cpu'
        print("Command:", idx, "on cpu on process", current_process()._identity[0])

    today = date.today()
    os.makedirs('./logs', exist_ok=True)
    try:
        start = time.time()
        with open(f"./logs/error_{idx}_{today}.txt", 'a+') as err:
            subprocess.run(command, shell=True, check=True, stderr=err)
        os.remove(f"./logs/error_{idx}_{today}.txt")
        elapsed = (time.time() - start)
        with open("./logs/finished_runs.txt", "a+") as fp:
            fp.write(f"{idx} -> {today} -> " + str(elapsed) + "s + " + command + "\n")
    except subprocess.CalledProcessError:
        with open(f"./logs/failed_runs_{today}.txt", "a+") as fp:
            fp.write(command + '\n')


def create_run(dataset, model, optimizer, seed, epochs, es_patience, batch_size, scheduler_params):
    scheduler_name, scheduler_params = scheduler_params
    scheduler_params = str(scheduler_params).replace(" ", "")
    scheduler_params = str(scheduler_params).replace('"', '\'')
    scheduler_params = '"' + scheduler_params + '"'
    return (
        f" -lr 0.001"
        f" -bs {batch_size}"
        f" -epochs {epochs}"
        f" -dataset {dataset}"
        f" -data_path ../data"
        f" -scheduler {scheduler_name}"
        f" -scheduler_params {scheduler_params}"
        f" -model {model}"
        f" -seed {seed}"
        f" -fill 0.5"
        f" --cutout"
        f" --autoaug"
        f" --tta"
    ) + (" --half" if torch.cuda.is_available() else "")  # parenthesized so only the --half flag is conditional


def generate_runs():
    datasets = [
        'cifar10', 'cifar100'
    ]
    models = [
        'preresnet18_c10'
    ]
    optimizers = [
        'sgd'
    ]
    seeds = [
        2525
    ]
    epochss = [
        10
    ]
    es_patiences = [
        20
    ]
    batch_sizes = [
        10, 16, 32
    ]
    schedulers = [
        ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 2.0, 'max_batch_size': max_batch_size}),
        ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 5.0, 'max_batch_size': max_batch_size}),
        ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.5}),
        ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.2}),

        ('StepBS', {'step_size': 30, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
        ('StepBS', {'step_size': 50, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
        ('StepBS', {'step_size': 30, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
        ('StepBS', {'step_size': 50, 'gamma': 5.0, 'max_batch_size': max_batch_size}),

        ('StepLR', {'step_size': 30, 'gamma': 2.0}),
        ('StepLR', {'step_size': 50, 'gamma': 2.0}),
        ('StepLR', {'step_size': 30, 'gamma': 5.0}),
        ('StepLR', {'step_size': 50, 'gamma': 5.0}),
    ]

    runs = []
    for dataset, model, optimizer, seed, epochs, es_patience, batch_size, scheduler_params in \
            itertools.product(datasets, models, optimizers, seeds, epochss, es_patiences, batch_sizes, schedulers):
        run = create_run(dataset=dataset, model=model, optimizer=optimizer, seed=seed, epochs=epochs,
                         es_patience=es_patience, batch_size=batch_size, scheduler_params=scheduler_params)
        runs.append(run)

    return [f"python main.py {i}" for i in runs]


if __name__ == "__main__":
    freeze_support()
    runs = generate_runs()

    # # Debug
    # for i in runs:
    #     print(i)

    print(len(runs))
    if last_index == -1 or last_index > len(runs):
        last_index = len(runs)

    with ProcessPoolExecutor(max_workers=gpu_count * processes_per_gpu) as executor:
        executor.map(run_command, [(runs[index], index) for index in range(run_index, last_index)])
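Each worker process picks its GPU from current_process()._identity[0] % gpu_count, and the two optional positional arguments slice the generated run list (start index inclusive, end index exclusive), so a sweep can be split across machines or resumed after a crash. A few illustrative invocations (the indices are arbitrary):

python experiment_runner.py          # run every generated configuration
python experiment_runner.py 12       # resume from configuration 12
python experiment_runner.py 12 24    # run configurations 12 through 23 only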
2 changes: 1 addition & 1 deletion main.py
@@ -23,6 +23,7 @@
parser.add_argument('-model', default='preresnet18_c10', type=str, help='model')
parser.add_argument('-fill', default=None, type=float, help='fill value for transformations')
parser.add_argument('-num_threads', default=None, type=int, help='default number of threads used by pytorch')
parser.add_argument('-seed', default=3, type=int, help='seed')
parser.add_argument('--cutout', action='store_true', default=False, help='apply cutout')
parser.add_argument('--autoaug', action='store_true', default=False, help='apply autoaugment')
parser.add_argument('--tta', action='store_true', default=False, help='use TTA')
@@ -38,6 +39,5 @@
print(args)
Trainer(args).run()

# TODO: Factor could be int
# PYTHONOPTIMIZE=2 python main.py -device cuda:0 -lr 0.001 -bs 10 -epochs 200 -dataset cifar10 -data_path ../data -scheduler ReduceLROnPlateau -scheduler_params "{'mode':'min', 'factor':0.5}" -model preresnet18_c10 -fill 0.5 --cutout --autoaug --tta --half # noqa: E501
# PYTHONOPTIMIZE=2 python main.py -device cuda:0 -lr 0.001 -bs 10 -epochs 200 -dataset cifar10 -data_path ../data -scheduler IncreaseBSOnPlateau -scheduler_params "{'mode':'min', 'factor':2.0, 'max_batch_size': 1000}" -model preresnet18_c10 -fill 0.5 --cutout --autoaug --tta --half # noqa: E501
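For reference only (not part of this diff), an analogous invocation for the newly supported StepBS scheduler would follow the same pattern, with parameters matching the hints in utils/scheduler.py below:

PYTHONOPTIMIZE=2 python main.py -device cuda:0 -lr 0.001 -bs 10 -epochs 200 -dataset cifar10 -data_path ../data -scheduler StepBS -scheduler_params "{'step_size': 30, 'gamma': 2.0, 'max_batch_size': 1000}" -model preresnet18_c10 -fill 0.5 --cutout --autoaug --tta --half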
12 changes: 10 additions & 2 deletions utils/scheduler.py
@@ -1,14 +1,22 @@
-from bs_scheduler import IncreaseBSOnPlateau
+from bs_scheduler import IncreaseBSOnPlateau, StepBS
 from torch.optim import Optimizer
-from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
 from torch.utils.data import DataLoader


 def init_scheduler(args, optimizer: Optimizer, train_loader: DataLoader):
     if args.scheduler == 'IncreaseBSOnPlateau':
+        # "{'mode':'min', 'factor':2.0, 'max_batch_size': 1000}"
         scheduler = IncreaseBSOnPlateau(train_loader, **args.scheduler_params)
     elif args.scheduler == 'ReduceLROnPlateau':
+        # "{'mode':'min', 'factor':0.5}"
         scheduler = ReduceLROnPlateau(optimizer, **args.scheduler_params)
+    elif args.scheduler == 'StepBS':
+        # "{'step_size':30, 'gamma': 2.0, 'max_batch_size': 1000}"
+        scheduler = StepBS(train_loader, **args.scheduler_params)
+    elif args.scheduler == 'StepLR':
+        # "{'step_size':30, 'gamma': 2.0}"
+        scheduler = StepLR(optimizer, **args.scheduler_params)
     else:
         raise NotImplementedError(f'Scheduler {args.scheduler} not implemented')
     return scheduler
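init_scheduler unpacks args.scheduler_params as keyword arguments, but the diff does not show how the quoted string from the command line (e.g. "{'mode':'min', 'factor':0.5}") becomes a dict. A minimal sketch, assuming main.py parses it with ast.literal_eval (the quoting style suggests a Python dict literal; parse_scheduler_params is a hypothetical helper name), could be:

import ast

def parse_scheduler_params(raw: str) -> dict:
    # Accepts a dict literal such as "{'step_size': 30, 'gamma': 2.0}".
    # literal_eval only evaluates Python literals, so no arbitrary code runs.
    params = ast.literal_eval(raw)
    if not isinstance(params, dict):
        raise ValueError(f"scheduler_params must be a dict literal, got {raw!r}")
    return params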
2 changes: 2 additions & 0 deletions utils/trainer.py
@@ -16,11 +16,13 @@
from utils.optimizer import init_optimizer
from utils.scheduler import init_scheduler
from utils.early_stopping import init_early_stopping
from utils.utils import seed_everything


class Trainer:
    def __init__(self, args):
        self.args = args
        seed_everything(args.seed)

        self.device = torch.device(args.device)
        print(f'Using {self.device}')
15 changes: 15 additions & 0 deletions utils/utils.py
@@ -0,0 +1,15 @@
import random

import numpy as np
import torch


def seed_everything(seed):
    if seed is None:
        return
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed_all(seed)
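seed_everything is called once at the top of Trainer.__init__ (see utils/trainer.py above). A quick sanity check of the behaviour, reusing the seed value from experiment_runner.py, might look like:

import torch
from utils.utils import seed_everything

seed_everything(2525)
a = torch.rand(3)
seed_everything(2525)
b = torch.rand(3)
assert torch.equal(a, b)  # re-seeding reproduces the same random draws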
