Support optimizer AdamW type (#670)
tjruwase authored Jan 15, 2021
1 parent f032e56 commit 865104b
Showing 4 changed files with 120 additions and 65 deletions.
5 changes: 3 additions & 2 deletions deepspeed/runtime/config.py
@@ -27,17 +27,18 @@
TENSOR_CORE_ALIGN_SIZE = 8

ADAM_OPTIMIZER = 'adam'
ADAMW_OPTIMIZER = 'adamw'
LAMB_OPTIMIZER = 'lamb'
ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
DEEPSPEED_OPTIMIZERS = [
ADAM_OPTIMIZER,
ADAMW_OPTIMIZER,
LAMB_OPTIMIZER,
ONEBIT_ADAM_OPTIMIZER,
]

# extra optimizer parameters for adam
# extra optimizer parameters for adam/adamw
TORCH_ADAM_PARAM = "torch_adam"
ADAM_W_MODE_PARAM = "adam_w_mode"


class DeepSpeedConfigError(Exception):
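For reference, the new ADAMW_OPTIMIZER constant means a DeepSpeed config can now request AdamW by name instead of toggling the adam_w_mode parameter this commit removes (ADAM_W_MODE_PARAM). A minimal config sketch mirroring the shape used by the new test below; the dict name and lr value are illustrative:

# Minimal DeepSpeed config selecting the new 'adamw' optimizer type by name.
ds_config = {
    "train_batch_size": 1,
    "steps_per_print": 1,
    "fp16": {
        "enabled": True,
        "initial_scale_power": 10
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 0.00015
        }
    }
}

As in the existing dict-config test, such a dict can be passed to deepspeed.initialize via config_params, or written to a JSON file referenced by --deepspeed_config.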
11 changes: 5 additions & 6 deletions deepspeed/runtime/engine.py
@@ -19,8 +19,8 @@
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
TORCH_ADAM_PARAM, ADAM_W_MODE_PARAM
ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
TORCH_ADAM_PARAM

from deepspeed.runtime.dataloader import DeepSpeedDataLoader
from deepspeed.runtime.constants import \
@@ -582,10 +582,9 @@ def _configure_basic_optimizer(self, model_parameters):
"'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
)

if self.optimizer_name() == ADAM_OPTIMIZER:
if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]:
torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE_PARAM, True)

adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER
# zero-offload  torch-adam  adam_w_mode  optimizer
#     T|F           T            T       torch.optim.AdamW
#     T|F           T            F       torch.optim.Adam
@@ -603,7 +602,7 @@ def _configure_basic_optimizer(self, model_parameters):
**optimizer_parameters,
adamw_mode=adam_w_mode)
else:
optimizer_parameters[ADAM_W_MODE_PARAM] = adam_w_mode
optimizer_parameters['adam_w_mode'] = adam_w_mode
optimizer = FusedAdam(model_parameters, **optimizer_parameters)

elif self.optimizer_name() == LAMB_OPTIMIZER:
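For context, the net effect of this hunk is that adam_w_mode is no longer read from the optimizer parameters; it is derived from the optimizer type itself ('adamw' vs. 'adam'). A rough sketch of the resulting dispatch, assuming FusedAdam and DeepSpeedCPUAdam are importable from deepspeed.ops.adam and accept the keyword names shown in the diff (the helper name here is illustrative, not the engine's actual method):

import torch

def _pick_adam_impl(optimizer_name, torch_adam, zero_cpu_offload,
                    model_parameters, optimizer_parameters):
    # 'adamw' implies decoupled weight decay; 'adam' keeps the classic update.
    adam_w_mode = optimizer_name == 'adamw'

    if torch_adam:
        # Plain PyTorch implementations, per the truth table in the diff.
        cls = torch.optim.AdamW if adam_w_mode else torch.optim.Adam
        return cls(model_parameters, **optimizer_parameters)

    if zero_cpu_offload:
        # CPU optimizer used with ZeRO-Offload; mode forwarded as 'adamw_mode'.
        from deepspeed.ops.adam import DeepSpeedCPUAdam
        return DeepSpeedCPUAdam(model_parameters,
                                **optimizer_parameters,
                                adamw_mode=adam_w_mode)

    # Fused GPU kernel; the diff forwards the mode via the 'adam_w_mode' kwarg.
    from deepspeed.ops.adam import FusedAdam
    return FusedAdam(model_parameters,
                     adam_w_mode=adam_w_mode,
                     **optimizer_parameters)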
7 changes: 6 additions & 1 deletion deepspeed/runtime/zero/utils.py
@@ -23,7 +23,12 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
return my_group


ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, FusedAdam, DeepSpeedCPUAdam]
ZERO_SUPPORTED_OPTIMIZERS = [
torch.optim.Adam,
torch.optim.AdamW,
FusedAdam,
DeepSpeedCPUAdam
]

# Add apex FusedAdam to supported list if apex is installed
try:
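Adding torch.optim.AdamW here is what lets a client-constructed AdamW pass ZeRO's supported-optimizer check (exercised by test_zero_supported_client_optimizer further down). A self-contained illustration of that kind of membership test; the helper name is hypothetical and the list is trimmed for brevity:

import torch

# Trimmed stand-in for the list above, for illustration only.
ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, torch.optim.AdamW]

def _is_zero_supported(optimizer):
    # Hypothetical check: ZeRO accepts a client optimizer only if its type is
    # in the list (or if zero_allow_untested_optimizer is set in the config).
    return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS

params = [torch.nn.Parameter(torch.zeros(4))]
assert _is_zero_supported(torch.optim.AdamW(params))            # newly accepted
assert not _is_zero_supported(torch.optim.SGD(params, lr=0.1))  # still untested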
162 changes: 106 additions & 56 deletions tests/unit/test_fp16.py
@@ -35,9 +35,9 @@ def test_lamb_fp32_grad_clip(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -73,9 +73,9 @@ def test_lamb_fp16_basic(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_lamb_fp16_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -110,9 +110,9 @@ def test_lamb_fp16_empty_grad(tmpdir):

@distributed_test(world_size=[2])
def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -147,9 +147,9 @@ def test_adam_fp32_empty_grad(tmpdir):

@distributed_test(world_size=[2])
def _test_adam_fp32_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -179,9 +179,9 @@ def test_adamw_fp16_basic(tmpdir):
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -210,10 +210,10 @@ def test_dict_config_adamw_fp16_basic():
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
config_params=config_dict)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
config_params=config_dict)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -245,9 +245,9 @@ def test_adamw_fp16_empty_grad(tmpdir):
@distributed_test(world_size=[1])
def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -270,7 +270,7 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
True),
])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 1,
@@ -311,9 +311,9 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):

@distributed_test(world_size=[1])
def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -338,7 +338,7 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
True),
])
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
@@ -364,9 +364,9 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
def _test_zero_static_scale(args):
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

# Ensure the static scaler is configured.
assert optim.dynamic_loss_scale == False
@@ -407,9 +407,9 @@ def test_zero_static_scale_deprecated_format(tmpdir):
def _test_zero_static_scale(args):
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

# Ensure the static scaler is configured.
assert optim.dynamic_loss_scale == False
@@ -438,7 +438,7 @@ def _test_zero_static_scale(args):
True),
])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
@@ -460,10 +460,10 @@ def _test_zero_allow_untested_optimizer(args):
model = SimpleModel(hidden_dim, empty_grad=True)
optimizer = SimpleOptimizer(model.parameters())
with pytest.raises(AssertionError):
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())

_test_zero_allow_untested_optimizer(args)

@@ -478,7 +478,7 @@ def _test_zero_allow_untested_optimizer(args):
True),
])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_micro_batch_size_per_gpu": 1,
@@ -536,9 +536,9 @@ def test_adam_amp_basic(tmpdir):
@distributed_test(world_size=[1])
def _test_adam_amp_basic(args, model, hidden_dim):
optimizer = torch.optim.Adam(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -574,9 +574,9 @@ def test_lamb_amp_basic(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_lamb_amp_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -613,9 +613,9 @@ def test_adam_amp_o2(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_adam_amp_o2(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -652,9 +652,9 @@ def test_adam_amp_o2_empty_grad(tmpdir):

@distributed_test(world_size=[2])
def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -694,8 +694,8 @@ def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
client_optimizer = optimizer_constructor(params=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=client_optimizer)
model=model,
optimizer=client_optimizer)

_test_zero_supported_client_optimizer(args=args,
model=model,
@@ -732,9 +732,9 @@ def test_zero2_reduce_scatter_off(tmpdir):

@distributed_test(world_size=[2])
def _helper(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -745,3 +745,53 @@ def _helper(args, model, hidden_dim):
model.step()

_helper(args=args, model=model, hidden_dim=hidden_dim)


@pytest.mark.parametrize('adam_type, torch_impl',
[('Adam',
True),
('Adam',
False),
('AdamW',
True),
('AdamW',
False)])
def test_fp16_adam_types(tmpdir, adam_type, torch_impl):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"fp16": {
"enabled": True,
"initial_scale_power": 10
},
"optimizer": {
"type": adam_type,
"torch_adam": torch_impl,
"params": {
"lr": 0.00015
}
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10

model = SimpleModel(hidden_dim, empty_grad=False)

@distributed_test(world_size=[1])
def _test_fp16_adam_types(args, model, hidden_dim):

model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

data_loader = random_dataloader(model=model,
total_samples=10,
hidden_dim=hidden_dim,
device=model.device)

for _, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()

_test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim)
