
Commit

fix typo in comments with deepspeed/ (#3537)
* fix spelling error with deepspeed/runtime/

* fix typo docs/

* fix typo in comments with deepspeed/

---------

Co-authored-by: Olatunji Ruwase <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
3 people authored May 15, 2023
1 parent 9685eb9 commit c8d3f5e
Showing 9 changed files with 13 additions and 13 deletions.
6 changes: 3 additions & 3 deletions deepspeed/compression/basic_layer.py
@@ -673,7 +673,7 @@ def _split(input_):


def _gather(input_):
"""Gather tensors and concatinate along the last dimension."""
"""Gather tensors and concatenate along the last dimension."""
group = g_mpu.get_model_parallel_group()

# Bypass the function if we are using only 1 GPU.
@@ -708,7 +708,7 @@ def backward(ctx, grad_output):


class _ReduceFromModelParallelRegion(torch.autograd.Function):
"""All-redcue the input from the model parallel region."""
"""All-reduce the input from the model parallel region."""

@staticmethod
def forward(ctx, input_):
@@ -732,7 +732,7 @@ def backward(ctx, grad_output):


class _GatherFromModelParallelRegion(torch.autograd.Function):
"""Gather the input from model parallel region and concatinate."""
"""Gather the input from model parallel region and concatenate."""

@staticmethod
def forward(ctx, input_):
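Note: the autograd functions touched in this file follow the usual Megatron-style model-parallel pattern, where a collective on the activations in the forward pass is paired with the complementary operation in the backward pass. A minimal sketch of the all-reduce case, assuming torch.distributed is already initialized; the class and argument names are illustrative, not the DeepSpeed API.

import torch
import torch.distributed as dist

class _AllReduceSketch(torch.autograd.Function):
    """Illustrative all-reduce of activations across a model-parallel group."""

    @staticmethod
    def forward(ctx, input_, group):
        # Sum the partial activations held by each model-parallel rank.
        dist.all_reduce(input_, group=group)
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        # Every rank already holds the identical, fully reduced gradient,
        # so the backward pass is the identity (None for the group argument).
        return grad_output, None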
2 changes: 1 addition & 1 deletion deepspeed/compression/compress.py
@@ -212,7 +212,7 @@ def student_initialization(student_model, teacher_model, deepspeed_config):
The prefix name before the layer #.
Example 1: bert.encoder.layer, for BERT_base model's prefix name
Example 2: transformer.h, for GPT-2 hugging face prefix name
-teacher_layer (`list of intergers`)
+teacher_layer (`list of integers`)
The layer of teacher will be used for student's reinitializedion
Example 1: [1,3,5,7,9], means we want to matches the 2nd/4th/6th/8th/10th layer of teacher to the first 5 layers of student
student_layer (`list` or None)
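Note: the teacher_layer argument documented above selects which teacher blocks seed the student. A hedged sketch of that mapping, using hypothetical teacher_layers/student_layers module lists rather than the real student_initialization internals.

import torch.nn as nn

# Hypothetical stand-ins for the teacher and student encoder stacks.
teacher_layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(12)])
student_layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(5)])

# [1, 3, 5, 7, 9] matches the 2nd/4th/6th/8th/10th teacher layer
# to the first five student layers, as in Example 1 above.
teacher_layer = [1, 3, 5, 7, 9]
for student_idx, teacher_idx in enumerate(teacher_layer):
    student_layers[student_idx].load_state_dict(
        teacher_layers[teacher_idx].state_dict())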
4 changes: 2 additions & 2 deletions deepspeed/nebula/constants.py
@@ -29,7 +29,7 @@
# There is a case where customer want to load the checkpoint saved
# by raw torch. Because nebula cannot load torch checkpoint directly
# as they have different folder structures to bring the gap for
-# loading(the data are totaly same in bytes for torch and enbula s
+# loading(the data are totally same in bytes for torch and nebula s
# aving).
# In this case, we must disable nebula load to use raw torch load.
# Customer can just set NEBULA_ENABLE_NEBULA_LOAD to False. Then use
@@ -60,7 +60,7 @@
NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2

-# Neubla envs
+# Nebula envs
NEBULA_EXPORT_ENVS = [
'DLTS_JOB_ID', 'DLTS_NUM_WORKER', 'NEBULA_PERSISTENT_STORAGE_PATH', 'NEBULA_PERSISTENT_TIME_INTERVAL',
'AML_RUN_ID', 'AZUREML_RUN_TOKEN', 'AZUREML_WORKSPACE_SCOPE', 'AZUREML_EXPERIMENT_SCOPE',
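Note: the comment above describes the fallback path for checkpoints saved by plain torch, where Nebula loading is disabled so raw torch load is used. A hedged sketch of what that toggle might look like in a DeepSpeed config dict; the key names are taken from the constants in this file, but the exact nesting is an assumption, not a verified schema.

# Assumed config shape -- check the DeepSpeed/Nebula documentation before relying on it.
ds_config = {
    "nebula": {
        "enabled": True,
        "enable_nebula_load": False,       # fall back to raw torch loading
        "num_of_version_in_retention": 2,  # matches the default shown above
    },
}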
2 changes: 1 addition & 1 deletion deepspeed/ops/adam/cpu_adam.py
@@ -63,7 +63,7 @@ def __init__(self,
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
adamw_mode: select between Adam and AdamW implementations (default: AdamW)
-full_precision_optimizer_states: creates momementum and variance in full precision regardless of
+full_precision_optimizer_states: creates momentum and variance in full precision regardless of
the precision of the parameters (default: True)
"""

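Note: a minimal usage sketch for the optimizer whose docstring is touched above, assuming the standard DeepSpeedCPUAdam constructor arguments (lr, weight_decay, adamw_mode) and that the cpu_adam extension can be built on the machine.

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

model = torch.nn.Linear(16, 16)
# adamw_mode=True selects the AdamW (decoupled weight decay) behavior described above.
optimizer = DeepSpeedCPUAdam(model.parameters(), lr=1e-3, weight_decay=0.01, adamw_mode=True)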
2 changes: 1 addition & 1 deletion deepspeed/ops/adam/fused_adam.py
@@ -148,7 +148,7 @@ def step(self, closure=None, grads=None, output_params=None, scale=None, grad_no
# State initialization
if len(state) == 0:
# DeepSpeed ZeRO 3 processes each subgroup a time, so we need to keep tracking step count for each tensor separately.
-# While this is not an issue for ZeRO 1 & 2, since they apply a single optimizatin step to the whole param group at the same time.
+# While this is not an issue for ZeRO 1 & 2, since they apply a single optimization step to the whole param group at the same time.
# In order to keep backward compatibility for the existing checkpoints, we use group['state'] to initialize state['step'] if it exists.
state['step'] = group.get('step', 0)
# Exponential moving average of gradient values
2 changes: 1 addition & 1 deletion deepspeed/ops/random_ltd/dropping_utils.py
@@ -32,7 +32,7 @@ def gpt_sample_tokens(reserved_length: int,
sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)

# Not certain the optimized kernel is actually better here, cause it kind of screws
-# with alignment right if the sequence length is not divisble by like 16
+# with alignment right if the sequence length is not divisible by like 16
# new_mask = random_ltd_module.mask_gather_gpt(attn_mask, reserved_length)
if attn_mask is not None:
new_mask = attn_mask[:, :, :reserved_length, :reserved_length]
4 changes: 2 additions & 2 deletions deepspeed/ops/sparse_attention/sparsity_config.py
@@ -78,7 +78,7 @@ def __init__(self, num_heads, block=16, different_layout_per_head=False):
super().__init__(num_heads, block, different_layout_per_head)

def make_layout(self, seq_len):
"""Set 1 to all blocks of the layout meanins the pattern is dense; not sparse.
"""Set 1 to all blocks of the layout meaning the pattern is dense; not sparse.
Arguments:
seq_len: required: an integer determining the underling sequence length; must be <= max sequence length
@@ -702,7 +702,7 @@ def set_sliding_window_layout(self, h, layout):
num_blocks = layout.shape[1]
if (num_blocks < self.num_sliding_window_blocks):
raise ValueError(
-f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
)

w = self.num_sliding_window_blocks // 2
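Note: the check fixed above guards construction of a sliding-window block layout. A small self-contained sketch of such a layout, independent of the SparsityConfig classes; the sizes are chosen arbitrarily for illustration.

import torch

num_blocks = 8                    # blocks per row of the layout
num_sliding_window_blocks = 3     # must not exceed num_blocks, as the check above enforces
w = num_sliding_window_blocks // 2

layout = torch.zeros(num_blocks, num_blocks, dtype=torch.int64)
for row in range(num_blocks):
    start = max(0, row - w)
    end = min(num_blocks, row + w + 1)
    layout[row, start:end] = 1    # each query block attends to a window of key blocks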
2 changes: 1 addition & 1 deletion scripts/replace_copyright.py
@@ -115,7 +115,7 @@ def get_header_c(fp):
# multiline comment not closed on same line
in_multiline = True
elif l.endswith(C_ML_CLOSE):
-# Ended a multline comment
+# Ended a multiline comment
in_multiline = False
elif not in_multiline or l.startswith(C_SL_COMMENT) or l.isspace():
# Not in a comment
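Note: the in_multiline bookkeeping shown above is how the script walks past a C-style header comment. A self-contained sketch of the same idea; the constant names mirror those in the hunk but are re-declared here as assumptions.

C_ML_OPEN, C_ML_CLOSE, C_SL_COMMENT = "/*", "*/", "//"

def leading_comment_lines(lines):
    """Yield leading comment/blank lines of a C file, stopping at the first code line."""
    in_multiline = False
    for l in lines:
        stripped = l.strip()
        if in_multiline:
            yield l
            if stripped.endswith(C_ML_CLOSE):
                in_multiline = False      # ended a multiline comment
        elif stripped.startswith(C_ML_OPEN):
            yield l
            if not stripped.endswith(C_ML_CLOSE):
                in_multiline = True       # multiline comment not closed on same line
        elif stripped.startswith(C_SL_COMMENT) or not stripped:
            yield l                       # single-line comment or blank line
        else:
            break                         # first real code line: header is over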
2 changes: 1 addition & 1 deletion tests/unit/inference/test_inference.py
@@ -54,7 +54,7 @@
]
_opt_models = [
"facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture.
"facebook/opt-350m", # 350m applies layer norm after attnention layer which is different than other variants.
"facebook/opt-350m", # 350m applies layer norm after attention layer which is different than other variants.
]
_all_models = HfApi().list_models()

