diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 6f3b2840152b..24c0f250a4b9 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -14,7 +14,7 @@ def init_process_groups(grid): global _groups, _grid _grid = grid - assert _grid.pipe_parallel_size > 1, "There is no model parallelism" + assert _grid.pipe_parallel_size > 1, "There is no pipeline parallelism" _groups = [dist.new_group(ranks=group) for group in _grid.p2p_groups] diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index c8835d178eff..1a590ae51827 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -260,7 +260,7 @@ def __init__(self, for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data - #divide the flat weights into near equal paritition equal to the data parallel degree + #divide the flat weights into near equal partitions, equal in number to the data parallel degree #each process will compute on a different part of the partition data_parallel_partitions = self.get_data_parallel_partitions( self.fp16_groups_flat[i]) @@ -367,10 +367,10 @@ def __init__(self, #stores the offset at which a parameter gradient needs to be inserted in a partition self.grad_partition_insertion_offset = {} - #the offset in the gradient at which it must be inserted at the beginning of the paritition + #the offset in the gradient at which it must be inserted at the beginning of the partition self.grad_start_offset = {} - #will store the averaged gradients required by this parititon + #will store the averaged gradients required by this partition self.averaged_gradients = {} # store index of first parameter in each partition