Skip to content

Commit

Permalink
add additional validation checks in elastic config (microsoft#646)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffra authored Jan 8, 2021
1 parent 828d75b commit bc046dc
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 2 deletions.
30 changes: 30 additions & 0 deletions deepspeed/elasticity/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,39 @@ def __init__(self, param_dict):
MAX_ACCEPTABLE_BATCH_SIZE,
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT)
self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT)

if not isinstance(self.micro_batches, list):
raise ElasticityConfigError(
f"Elasticity expected value of {MICRO_BATCHES} to be a "
f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}"
)

if not all(map(lambda m: isinstance(m, int), self.micro_batches)):
raise ElasticityConfigError(
f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, "
f"instead contains: f{self.micro_batches}")

if not all(map(lambda m: m > 0, self.micro_batches)):
raise ElasticityConfigError(
f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, "
f"instead contains: f{self.micro_batches}")

self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT)
self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT)
if self.min_gpus < 1 or self.max_gpus < 1:
raise ElasticityConfigError(
"Elasticity min/max gpus must be > 0, "
f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}")
if self.max_gpus < self.min_gpus:
raise ElasticityConfigError(
"Elasticity min_gpus cannot be greater than max_gpus, "
f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}")

self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT)
if self.min_time < 0:
raise ElasticityConfigError(
f"Elasticity min time needs to be >= 0: given {self.min_time}")

self.version = param_dict.get(VERSION, VERSION_DEFAULT)
self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH,
PREFER_LARGER_BATCH_DEFAULT)
Expand Down
4 changes: 2 additions & 2 deletions deepspeed/elasticity/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@
MAX_GPUS = 'max_gpus'
MAX_GPUS_DEFAULT = 10000

# Minimum running time (minutes) before the scheduler will scale us
# Minimum running time (minutes) before the scheduler will scale us, 0 implies it's unknown
MIN_TIME = "min_time"
# NOTE(review): this diff view lists two values for MIN_TIME_DEFAULT without +/- markers.
# Presumably the string "20" is the removed line and the integer 0 is the added one
# (the config validation compares min_time numerically with `< 0`) -- confirm against
# the repository. If both lines were executed as-is, the later assignment (0) would win.
MIN_TIME_DEFAULT = "20"
MIN_TIME_DEFAULT = 0

# When finding a suitable batch size, attempt to find one that is closest
# to the max train batch size given.
Expand Down
29 changes: 29 additions & 0 deletions tests/unit/test_elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,35 @@ def test_empty_config():
target_deepspeed_version=ds_version)


@pytest.mark.parametrize(
    'key, value',
    [
        # negative entries in the micro batch list
        ('micro_batch_sizes', [1, 4, -1, 2, -10]),
        # GPU bounds must be >= 1
        ('min_gpus', -1),
        ('max_gpus', -1),
        # micro batches must be a list, not a scalar
        ('micro_batch_sizes', 5),
        # micro batches must contain only integers
        ('micro_batch_sizes', ['a', None, 0.5]),
        ('micro_batch_sizes', [2, 0.5, 4]),
    ])
def test_invalid_config_values(key, value):
    """Check that an invalid elasticity setting is rejected.

    Each parametrized case overrides a single key of the ``elasticity``
    section with an invalid value and expects ``compute_elastic_config``
    to raise ``ElasticityError``.

    Args:
        key: Name of the entry in the ``elasticity`` section to override.
        value: Invalid value to store under ``key``.
    """
    # Local import keeps this block self-contained; the module's import
    # section is not visible from this hunk.
    import copy

    # A shallow ``dict.copy()`` would share the nested 'elasticity' dict with
    # ``base_ds_config``, so the invalid value written below would leak into
    # the module-level config and affect every test that runs afterwards.
    ds_config = copy.deepcopy(base_ds_config)
    ds_config['elasticity'][key] = value
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(
            ds_config=ds_config,
            target_deepspeed_version=ds_version)


def test_proper_mbsz():
ds_config = base_ds_config.copy()
ds_config["elasticity"]["max_train_batch_size"] = 32
Expand Down

0 comments on commit bc046dc

Please sign in to comment.