Support AdamW optimizer type #670

Merged · 2 commits · Jan 15, 2021
5 changes: 3 additions & 2 deletions deepspeed/runtime/config.py
@@ -27,17 +27,18 @@
TENSOR_CORE_ALIGN_SIZE = 8

ADAM_OPTIMIZER = 'adam'
ADAMW_OPTIMIZER = 'adamw'
LAMB_OPTIMIZER = 'lamb'
ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
DEEPSPEED_OPTIMIZERS = [
ADAM_OPTIMIZER,
ADAMW_OPTIMIZER,
LAMB_OPTIMIZER,
ONEBIT_ADAM_OPTIMIZER,
]

# extra optimizer parameters for adam
# extra optimizer parameters for adam/adamw
TORCH_ADAM_PARAM = "torch_adam"
ADAM_W_MODE_PARAM = "adam_w_mode"


class DeepSpeedConfigError(Exception):
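For context, a minimal sketch of what the new constant enables on the user side: a DeepSpeed config can now name AdamW directly instead of steering "Adam" through the old adam_w_mode flag. Everything below other than "type": "AdamW" and "torch_adam" is an illustrative placeholder, not part of this diff.

# Illustrative config sketch; values are hypothetical.
ds_config = {
    "train_batch_size": 1,
    "fp16": {"enabled": True},
    "optimizer": {
        "type": "AdamW",      # resolves to ADAMW_OPTIMIZER above (the new test below passes "AdamW")
        "torch_adam": False,  # TORCH_ADAM_PARAM: True -> torch.optim.AdamW, False -> fused implementation
        "params": {"lr": 1.5e-4}
    }
}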
11 changes: 5 additions & 6 deletions deepspeed/runtime/engine.py
@@ -19,8 +19,8 @@
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
TORCH_ADAM_PARAM, ADAM_W_MODE_PARAM
ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
TORCH_ADAM_PARAM

from deepspeed.runtime.dataloader import DeepSpeedDataLoader
from deepspeed.runtime.constants import \
@@ -582,10 +582,9 @@ def _configure_basic_optimizer(self, model_parameters):
"'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
)

if self.optimizer_name() == ADAM_OPTIMIZER:
if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]:
torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE_PARAM, True)

adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER
# zero-offload   torch-adam   adam_w_mode   optimizer
#     T|F            T             T        torch.optim.AdamW
#     T|F            T             F        torch.optim.Adam
@@ -603,7 +602,7 @@ def _configure_basic_optimizer(self, model_parameters):
**optimizer_parameters,
adamw_mode=adam_w_mode)
else:
optimizer_parameters[ADAM_W_MODE_PARAM] = adam_w_mode
optimizer_parameters['adam_w_mode'] = adam_w_mode
optimizer = FusedAdam(model_parameters, **optimizer_parameters)

elif self.optimizer_name() == LAMB_OPTIMIZER:
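Read end to end, the change above derives adam_w_mode from the configured optimizer type rather than from the removed ADAM_W_MODE_PARAM config entry. The sketch below paraphrases that flow in one hypothetical helper; the function name, signature, and import paths are assumptions for illustration, not the engine's verbatim code.

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam  # assumed import path

def _build_adam_variant(optimizer_name, optimizer_parameters, model_parameters, zero_offload):
    # "torch_adam" still comes from the optimizer params; adam_w_mode now
    # follows directly from the configured type ('adam' vs. 'adamw').
    torch_adam = optimizer_parameters.pop('torch_adam', False)
    adam_w_mode = optimizer_name == 'adamw'

    if torch_adam:
        optim_cls = torch.optim.AdamW if adam_w_mode else torch.optim.Adam
        return optim_cls(model_parameters, **optimizer_parameters)
    if zero_offload:
        return DeepSpeedCPUAdam(model_parameters,
                                **optimizer_parameters,
                                adamw_mode=adam_w_mode)
    optimizer_parameters['adam_w_mode'] = adam_w_mode
    return FusedAdam(model_parameters, **optimizer_parameters)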
7 changes: 6 additions & 1 deletion deepspeed/runtime/zero/utils.py
@@ -23,7 +23,12 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
return my_group


ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, FusedAdam, DeepSpeedCPUAdam]
ZERO_SUPPORTED_OPTIMIZERS = [
torch.optim.Adam,
torch.optim.AdamW,
FusedAdam,
DeepSpeedCPUAdam
]

# Add apex FusedAdam to supported list if apex is installed
try:
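ZERO_SUPPORTED_OPTIMIZERS is the whitelist ZeRO consults for client-supplied optimizers, so adding torch.optim.AdamW means a user-constructed AdamW no longer needs the zero_allow_untested_optimizer escape hatch exercised in the tests below. A minimal sketch of the kind of check this list feeds, with a hypothetical helper name:

def _is_zero_supported_optimizer(optimizer) -> bool:
    # ZERO_SUPPORTED_OPTIMIZERS is the list defined above; an exact-type check
    # is shown here, and the real helper in zero/utils.py may differ.
    return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS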
162 changes: 106 additions & 56 deletions tests/unit/test_fp16.py
@@ -35,9 +35,9 @@ def test_lamb_fp32_grad_clip(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -73,9 +73,9 @@ def test_lamb_fp16_basic(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_lamb_fp16_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -110,9 +110,9 @@ def test_lamb_fp16_empty_grad(tmpdir):

@distributed_test(world_size=[2])
def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -147,9 +147,9 @@ def test_adam_fp32_empty_grad(tmpdir):

@distributed_test(world_size=[2])
def _test_adam_fp32_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -179,9 +179,9 @@ def test_adamw_fp16_basic(tmpdir):
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -210,10 +210,10 @@ def test_dict_config_adamw_fp16_basic():
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
config_params=config_dict)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
config_params=config_dict)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -245,9 +245,9 @@ def test_adamw_fp16_empty_grad(tmpdir):
@distributed_test(world_size=[1])
def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -270,7 +270,7 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
True),
])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 1,
@@ -311,9 +311,9 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):

@distributed_test(world_size=[1])
def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -338,7 +338,7 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
True),
])
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
@@ -364,9 +364,9 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
def _test_zero_static_scale(args):
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

# Ensure the static scaler is configured.
assert optim.dynamic_loss_scale == False
@@ -407,9 +407,9 @@ def test_zero_static_scale_deprecated_format(tmpdir):
def _test_zero_static_scale(args):
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

# Ensure the static scaler is configured.
assert optim.dynamic_loss_scale == False
@@ -438,7 +438,7 @@ def _test_zero_static_scale(args):
True),
])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
@@ -460,10 +460,10 @@ def _test_zero_allow_untested_optimizer(args):
model = SimpleModel(hidden_dim, empty_grad=True)
optimizer = SimpleOptimizer(model.parameters())
with pytest.raises(AssertionError):
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())

_test_zero_allow_untested_optimizer(args)

@@ -478,7 +478,7 @@ def _test_zero_allow_untested_optimizer(args):
True),
])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_micro_batch_size_per_gpu": 1,
@@ -536,9 +536,9 @@ def test_adam_amp_basic(tmpdir):
@distributed_test(world_size=[1])
def _test_adam_amp_basic(args, model, hidden_dim):
optimizer = torch.optim.Adam(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -574,9 +574,9 @@ def test_lamb_amp_basic(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_lamb_amp_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -613,9 +613,9 @@ def test_adam_amp_o2(tmpdir):

@distributed_test(world_size=[1, 2])
def _test_adam_amp_o2(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -652,9 +652,9 @@ def test_adam_amp_o2_empty_grad(tmpdir):

@distributed_test(world_size=[2])
def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -694,8 +694,8 @@ def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
client_optimizer = optimizer_constructor(params=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=client_optimizer)
model=model,
optimizer=client_optimizer)

_test_zero_supported_client_optimizer(args=args,
model=model,
@@ -732,9 +732,9 @@ def test_zero2_reduce_scatter_off(tmpdir):

@distributed_test(world_size=[2])
def _helper(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
@@ -745,3 +745,53 @@ def _helper(args, model, hidden_dim):
model.step()

_helper(args=args, model=model, hidden_dim=hidden_dim)


@pytest.mark.parametrize('adam_type, torch_impl',
[('Adam',
True),
('Adam',
False),
('AdamW',
True),
('AdamW',
False)])
def test_fp16_adam_types(tmpdir, adam_type, torch_impl):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"fp16": {
"enabled": True,
"initial_scale_power": 10
},
"optimizer": {
"type": adam_type,
"torch_adam": torch_impl,
"params": {
"lr": 0.00015
}
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10

model = SimpleModel(hidden_dim, empty_grad=False)

@distributed_test(world_size=[1])
def _test_fp16_adam_types(args, model, hidden_dim):

model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

data_loader = random_dataloader(model=model,
total_samples=10,
hidden_dim=hidden_dim,
device=model.device)

for _, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()

_test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim)
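
Given the selection table in engine.py above and no ZeRO offload in this config, the four parametrized cases should land on the following underlying optimizers. This mapping is an editorial summary for review, not an assertion the test itself makes.

# (adam_type, torch_adam) -> expected underlying optimizer
expected_optimizer = {
    ("Adam", True): "torch.optim.Adam",
    ("AdamW", True): "torch.optim.AdamW",
    ("Adam", False): "FusedAdam(adam_w_mode=False)",
    ("AdamW", False): "FusedAdam(adam_w_mode=True)",
}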