This repository has been archived by the owner on Jul 1, 2024. It is now read-only.

Move use_gpu from ClassyTrainer to ClassificationTask #468

Closed
wants to merge 1 commit into from
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -106,7 +106,7 @@ jobs:
pip install .
classy-project my-project
pushd my-project
./classy_train.py --device cpu --config configs/template_config.json
./classy_train.py --config configs/template_config.json
popd
rm -rf my-project

7 changes: 1 addition & 6 deletions classy_train.py
@@ -93,18 +93,13 @@ def main(args, config):
# Configure hooks to do tensorboard logging, checkpoints and so on
task.set_hooks(configure_hooks(args, config))

use_gpu = None
if args.device is not None:
use_gpu = args.device == "gpu"
assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"

# LocalTrainer is used for a single node. DistributedTrainer will setup
# training to use PyTorch's DistributedDataParallel.
trainer_class = {"none": LocalTrainer, "ddp": DistributedTrainer}[
args.distributed_backend
]

trainer = trainer_class(use_gpu=use_gpu, num_dataloader_workers=args.num_workers)
trainer = trainer_class(num_dataloader_workers=args.num_workers)

logging.info(
f"Starting training on rank {get_rank()} worker. "
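With this change the script no longer resolves a device itself: the trainer is built without use_gpu and the task decides. A minimal sketch of the resulting call pattern, assuming a task built elsewhere (the helper name run is made up for illustration):

from classy_vision.trainer import LocalTrainer

def run(task, num_workers=4):
    # The trainer no longer takes use_gpu; device placement follows task.use_gpu.
    trainer = LocalTrainer(num_dataloader_workers=num_workers)
    trainer.train(task)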
13 changes: 0 additions & 13 deletions classy_vision/generic/opts.py
@@ -18,12 +18,6 @@ def add_generic_args(parser):
parser.add_argument(
"--config_file", type=str, help="path to config file for model", required=True
)
parser.add_argument(
"--device",
default=None,
type=str,
help="device to use: either 'cpu' or 'gpu'. If unspecified, will use GPU when available and CPU otherwise.",
)
parser.add_argument(
"--num_workers",
default=4,
@@ -145,13 +139,6 @@ def check_generic_args(args):
# check types and values:
assert is_pos_int(args.num_workers), "incorrect number of workers"
assert is_pos_int(args.visdom_port), "incorrect visdom port"
assert (
args.device is None or args.device == "cpu" or args.device == "gpu"
), "unknown device"

# check that CUDA is available:
if args.device == "gpu":
assert torch.cuda.is_available(), "CUDA required to train on GPUs"

# create checkpoint folder if it does not exist:
if args.checkpoint_folder != "" and not os.path.exists(args.checkpoint_folder):
43 changes: 24 additions & 19 deletions classy_vision/tasks/classification_task.py
@@ -142,6 +142,16 @@ def __init__(self):
self.perf_log = []
self.last_batch = None
self.batch_norm_sync_mode = BatchNormSyncMode.DISABLED
self.use_gpu = torch.cuda.is_available()

def set_use_gpu(self, use_gpu: bool):
self.use_gpu = use_gpu

assert (
not self.use_gpu or torch.cuda.is_available()
), "CUDA required to train on GPUs"

return self

def set_checkpoint(self, checkpoint):
"""Sets checkpoint on task.
@@ -359,6 +369,10 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
.set_hooks(hooks)
)

use_gpu = config.get("use_gpu")
if use_gpu is not None:
task.set_use_gpu(use_gpu)

for phase_type in phase_types:
task.set_dataset(datasets[phase_type], phase_type)

@@ -508,24 +522,19 @@ def build_dataloaders(
for phase_type in self.datasets.keys()
}

def prepare(
self,
num_dataloader_workers=0,
pin_memory=False,
use_gpu=False,
dataloader_mp_context=None,
):
def prepare(self, num_dataloader_workers=0, dataloader_mp_context=None):
"""Prepares task for training, populates all derived attributes

Args:
num_dataloader_workers: Number of dataloading processes. If 0,
dataloading is done on main process
pin_memory: if true pin memory on GPU
use_gpu: if true, load model, optimizer, loss, etc on GPU
dataloader_mp_context: Determines how processes are spawned.
Value must be one of None, "spawn", "fork", "forkserver".
If None, then context is inherited from parent process
"""

pin_memory = self.use_gpu and torch.cuda.device_count() > 1

self.phases = self._build_phases()
self.dataloaders = self.build_dataloaders(
num_workers=num_dataloader_workers,
@@ -539,7 +548,7 @@ def prepare(
self.base_model = apex.parallel.convert_syncbn_model(self.base_model)

# move the model and loss to the right device
if use_gpu:
if self.use_gpu:
self.base_model, self.loss = copy_model_to_gpu(self.base_model, self.loss)
else:
self.loss.cpu()
@@ -686,7 +695,7 @@ def set_classy_state(self, state):
# Set up pytorch module in train vs eval mode, update optimizer.
self._set_model_train_mode()

def eval_step(self, use_gpu):
def eval_step(self):
self.last_batch = None

# Process next sample
@@ -699,7 +708,7 @@ def eval_step(self, use_gpu):

# Copy sample to GPU
target = sample["target"]
if use_gpu:
if self.use_gpu:
for key, value in sample.items():
sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

@@ -726,12 +735,8 @@ def check_inf_nan(self, loss):
if loss == float("inf") or loss == float("-inf") or loss != loss:
raise FloatingPointError(f"Loss is infinity or NaN: {loss}")

def train_step(self, use_gpu):
"""Train step to be executed in train loop

Args:
use_gpu: if true, execute training on GPU
"""
def train_step(self):
"""Train step to be executed in train loop."""

self.last_batch = None

@@ -745,7 +750,7 @@ def train_step(self, use_gpu):

# Copy sample to GPU
target = sample["target"]
if use_gpu:
if self.use_gpu:
for key, value in sample.items():
sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

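The task now owns the GPU decision: it defaults to torch.cuda.is_available() and can be overridden either with the new fluent setter or through the task config. A minimal sketch (the config layout mentioned in the comment is an assumption, not taken from this diff):

import torch
from classy_vision.tasks import ClassificationTask

# set_use_gpu returns the task, so it chains like the other setters, and it
# asserts that CUDA is actually available when use_gpu is True.
task = ClassificationTask().set_use_gpu(torch.cuda.is_available())

# from_config reads the same setting from an optional "use_gpu" key in the task
# config; when the key is absent, the default from __init__ applies.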
23 changes: 6 additions & 17 deletions classy_vision/tasks/classy_task.py
@@ -86,11 +86,7 @@ def set_classy_state(self, state):

@abstractmethod
def prepare(
self,
num_dataloader_workers=0,
pin_memory=False,
use_gpu=False,
dataloader_mp_context=None,
self, num_dataloader_workers=0, dataloader_mp_context=None
) -> None:
"""
Prepares the task for training.
@@ -102,19 +98,15 @@ def prepare(
num_dataloader_workers: Number of workers to create for the dataloaders
pin_memory: Whether the dataloaders should copy the Tensors into CUDA
pinned memory (default False)
use_gpu: True if training on GPUs, False otherwise
"""
pass

@abstractmethod
def train_step(self, use_gpu) -> None:
def train_step(self) -> None:
"""
Run a train step.

This corresponds to training over one batch of data from the dataloaders.

Args:
use_gpu: True if training on GPUs, False otherwise
"""
pass

@@ -155,24 +147,21 @@ def on_end(self):
pass

@abstractmethod
def eval_step(self, use_gpu) -> None:
def eval_step(self) -> None:
"""
Run an evaluation step.

This corresponds to evaluating the model over one batch of data.

Args:
use_gpu: True if training on GPUs, False otherwise
"""
pass

def step(self, use_gpu) -> None:
def step(self) -> None:
from classy_vision.hooks import ClassyHookFunctions

if self.train:
self.train_step(use_gpu)
self.train_step()
else:
self.eval_step(use_gpu)
self.eval_step()

for hook in self.hooks:
hook.on_step(self)
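For custom tasks, the abstract interface no longer threads use_gpu through prepare/train_step/eval_step. A rough sketch of the signatures a subclass now implements (only the methods touched here; ClassyTask defines further abstract methods, so this stub is illustrative rather than instantiable):

from classy_vision.tasks import ClassyTask

class MyTask(ClassyTask):
    def prepare(self, num_dataloader_workers=0, dataloader_mp_context=None) -> None:
        pass

    def train_step(self) -> None:
        # read a device flag such as self.use_gpu instead of a use_gpu argument
        pass

    def eval_step(self) -> None:
        pass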
10 changes: 2 additions & 8 deletions classy_vision/tasks/fine_tuning_task.py
@@ -67,18 +67,12 @@ def _set_model_train_mode(self):
self.base_model.train(phase["train"])

def prepare(
self,
num_dataloader_workers: int = 0,
pin_memory: bool = False,
use_gpu: bool = False,
dataloader_mp_context=None,
self, num_dataloader_workers: int = 0, dataloader_mp_context=None
) -> None:
assert (
self.pretrained_checkpoint is not None
), "Need a pretrained checkpoint for fine tuning"
super().prepare(
num_dataloader_workers, pin_memory, use_gpu, dataloader_mp_context
)
super().prepare(num_dataloader_workers, dataloader_mp_context)
if self.checkpoint is None:
# no checkpoint exists, load the model's state from the pretrained
# checkpoint
12 changes: 1 addition & 11 deletions classy_vision/trainer/classy_trainer.py
@@ -27,25 +27,18 @@ class ClassyTrainer:

def __init__(
self,
use_gpu: Optional[bool] = None,
num_dataloader_workers: int = 0,
dataloader_mp_context: Optional[str] = None,
):
"""Constructor for ClassyTrainer.

Args:
use_gpu: If true, then use GPUs for training.
If None, then check if we have GPUs available, if we do
then use GPU for training.
num_dataloader_workers: Number of CPU processes doing dataloading
per GPU. If 0, then dataloading is done on main thread.
dataloader_mp_context: Determines how to launch
new processes for dataloading. Must be one of "fork", "forkserver",
"spawn". If None, process launching is inherited from parent.
"""
if use_gpu is None:
use_gpu = torch.cuda.is_available()
self.use_gpu = use_gpu
self.num_dataloader_workers = num_dataloader_workers
self.dataloader_mp_context = dataloader_mp_context

@@ -57,11 +50,8 @@ def train(self, task: ClassyTask):
everything that is needed for training
"""

pin_memory = self.use_gpu and torch.cuda.device_count() > 1
task.prepare(
num_dataloader_workers=self.num_dataloader_workers,
pin_memory=pin_memory,
use_gpu=self.use_gpu,
dataloader_mp_context=self.dataloader_mp_context,
)
assert isinstance(task, ClassyTask)
@@ -75,7 +65,7 @@ def train(self, task: ClassyTask):
task.on_phase_start()
while True:
try:
task.step(self.use_gpu)
task.step()
except StopIteration:
break
task.on_phase_end()
30 changes: 5 additions & 25 deletions classy_vision/trainer/distributed_trainer.py
@@ -56,39 +56,19 @@ class DistributedTrainer(ClassyTrainer):
"""Distributed trainer for using multiple training processes
"""

def __init__(
self,
use_gpu: Optional[bool] = None,
num_dataloader_workers: int = 0,
dataloader_mp_context: Optional[str] = None,
):
"""Constructor for DistributedTrainer.

Args:
use_gpu: If true, then use GPU 0 for training.
If None, then check if we have GPUs available, if we do
then use GPU for training.
num_dataloader_workers: Number of CPU processes doing dataloading
per GPU. If 0, then dataloading is done on main thread.
dataloader_mp_context: Determines how to launch
new processes for dataloading. Must be one of "fork", "forkserver",
"spawn". If None, process launching is inherited from parent.
"""
super().__init__(
use_gpu=use_gpu,
num_dataloader_workers=num_dataloader_workers,
dataloader_mp_context=dataloader_mp_context,
)
def train(self, task):
_init_env_vars()
_init_distributed(self.use_gpu)
_init_distributed(task.use_gpu)
logging.info(
f"Done setting up distributed process_group with rank {get_rank()}"
+ f", world_size {get_world_size()}"
)
local_rank = int(os.environ["LOCAL_RANK"])
if self.use_gpu:
if task.use_gpu:
logging.info("Using GPU, CUDA device index: {}".format(local_rank))
set_cuda_device_index(local_rank)
else:
logging.info("Using CPU")
set_cpu_device()

super().train(task)
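DistributedTrainer.train() now pulls the device choice from the task as well: it initialises the process group with task.use_gpu and selects the CUDA device from LOCAL_RANK. A short sketch of the call site under that assumption (the launch helper is illustrative; environment variables such as LOCAL_RANK are expected to be set by the process launcher):

from classy_vision.trainer import DistributedTrainer

def launch(task):
    # Nothing device-related is passed to the trainer any more; it reads task.use_gpu.
    trainer = DistributedTrainer(num_dataloader_workers=4)
    trainer.train(task)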
28 changes: 4 additions & 24 deletions classy_vision/trainer/local_trainer.py
@@ -16,32 +16,12 @@ class LocalTrainer(ClassyTrainer):
"""Trainer to be used if you want want use only a single training process.
"""

def __init__(
self,
use_gpu: Optional[bool] = None,
num_dataloader_workers: int = 0,
dataloader_mp_context: Optional[str] = None,
):
"""Constructor for LocalTrainer.

Args:
use_gpu: If true, then use GPU 0 for training.
If None, then check if we have GPUs available, if we do
then use GPU for training.
num_dataloader_workers: Number of CPU processes doing dataloading
per GPU. If 0, then dataloading is done on main thread.
dataloader_mp_context: Determines how to launch
new processes for dataloading. Must be one of "fork", "forkserver",
"spawn". If None, process launching is inherited from parent.
"""
super().__init__(
use_gpu=use_gpu,
num_dataloader_workers=num_dataloader_workers,
dataloader_mp_context=dataloader_mp_context,
)
if self.use_gpu:
def train(self, task):
if task.use_gpu:
logging.info("Using GPU, CUDA device index: {}".format(0))
set_cuda_device_index(0)
else:
logging.info("Using CPU")
set_cpu_device()

super().train(task)