diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index 51bab58a2a..9114bc47aa 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -5,7 +5,7 @@
 import math
 import warnings
-from typing import Any, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -18,7 +18,7 @@
 
 def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
-                     original_is_causal: bool) -> bool:
+                     original_is_causal: bool):
     # disable causal when it is not needed
     # necessary for flash & triton for generation with kv_cache
     if original_is_causal and num_query_tokens != num_key_tokens:
@@ -31,23 +31,6 @@ def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
     return original_is_causal
 
 
-def repeat_kv_for_gqa(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """Perform repeat of kv heads along a particular dimension.
-
-    hidden.shape expected to be: (batch size, seq len, kv_n_heads, head_dim)
-    n_rep: amount of repetitions of kv_n_heads
-    Unlike torch.repeat_interleave, this function avoids allocating new memory.
-    """
-    if n_rep == 1:
-        return hidden
-
-    b, s, kv_n_heads, d = hidden.shape
-
-    hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
-
-    return hidden.reshape(b, s, kv_n_heads * n_rep, d)
-
-
 def scaled_multihead_dot_product_attention(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -101,11 +84,8 @@ def scaled_multihead_dot_product_attention(
 
     # grouped query case
     if kv_n_heads > 1 and kv_n_heads < n_heads:
-        # necessary to do a transpose to swap (b h s d) -> (b s h d) for repeat_kv_for_gqa function
-        k = repeat_kv_for_gqa(k.transpose(1, 2),
-                              n_heads // kv_n_heads).transpose(1, 2)
-        v = repeat_kv_for_gqa(v.transpose(1, 2),
-                              n_heads // kv_n_heads).transpose(1, 2)
+        k = k.repeat_interleave(n_heads // kv_n_heads, dim=1)
+        v = v.repeat_interleave(n_heads // kv_n_heads, dim=1)
 
     if softmax_scale is None:
         softmax_scale = 1 / math.sqrt(d)
@@ -263,16 +243,10 @@ def flash_attn_fn(
     elif kv_n_heads < n_heads:
         # Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
        # We repeat each kv head by the group size number to use the underlying MHA kernels
-
-        # since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d)
-        # we use .view to modify {key, value}_unpad appropriately
-
-        key_unpad = repeat_kv_for_gqa(
-            key_unpad.view(batch_size, seqlen, kv_n_heads, -1),
-            n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
-        value_unpad = repeat_kv_for_gqa(
-            value_unpad.view(batch_size, seqlen, kv_n_heads, -1),
-            n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
+        # done along the head dimension = 1
+        key_unpad = key_unpad.repeat_interleave(n_heads // kv_n_heads, dim=1)
+        value_unpad = value_unpad.repeat_interleave(n_heads // kv_n_heads,
+                                                    dim=1)
 
     dropout_p = dropout_p if training else 0.0
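The `repeat_interleave` call sites above are drop-in replacements for the removed `repeat_kv_for_gqa` helper: the helper built the same tensor but deferred the copy by broadcasting with `expand` (the `reshape` still copies, since the expanded view is not contiguous). A standalone sanity check, not part of the patch and with made-up shapes, on the `(b, s, kv_n_heads, d)` layout the helper documented:

```python
import torch

# Hypothetical sizes; g = n_heads // kv_n_heads is the GQA group size.
b, s, kv_n_heads, d, g = 2, 5, 4, 8, 3
hidden = torch.randn(b, s, kv_n_heads, d)

# What the removed repeat_kv_for_gqa did: insert a repeat axis via expand,
# then flatten it into the kv-head axis.
via_expand = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, g,
                                             d).reshape(b, s, kv_n_heads * g, d)

# What the new call sites do: one repeat_interleave along the kv-head axis.
via_repeat = hidden.repeat_interleave(g, dim=2)

assert torch.equal(via_expand, via_repeat)
```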
@@ -399,108 +373,6 @@ def triton_flash_attn_fn(
     key = rearrange(key, 'b s (h d) -> b s h d', h=kv_n_heads)
     value = rearrange(value, 'b s (h d) -> b s h d', h=kv_n_heads)
 
-    # multi-query case
-    if kv_n_heads == 1:
-        # necessary to repeat instead of expand tensor because
-        # output contains NaN in edge cases such as with head dimension = 8
-        key = key.repeat(1, 1, n_heads, 1)
-        value = value.repeat(1, 1, n_heads, 1)
-    # grouped query case
-    elif kv_n_heads < n_heads:
-        # Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
-        # We repeat each kv head by the group size number to use the underlying MHA kernels
-        key = repeat_kv_for_gqa(key, n_heads // kv_n_heads)
-        value = repeat_kv_for_gqa(value, n_heads // kv_n_heads)
-
-    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
-    attn_output = flash_attn_func(  # type: ignore
-        query, key, value, attn_bias, reset_is_causal, softmax_scale)
-
-    output = attn_output.view(*attn_output.shape[:2], -1)  # type: ignore
-
-    return output, None, past_key_value
-
-
-def xformers_attn_fn(query: torch.Tensor,
-                     key: torch.Tensor,
-                     value: torch.Tensor,
-                     n_heads: int,
-                     kv_n_heads: Optional[int] = None,
-                     past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-                     softmax_scale: Optional[float] = None,
-                     attn_bias: Optional[torch.Tensor] = None,
-                     key_padding_mask: Optional[torch.Tensor] = None,
-                     is_causal: bool = False,
-                     dropout_p: float = 0.0,
-                     training: bool = False,
-                     needs_weights: bool = False,
-                     multiquery: bool = False,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
-                                                                torch.Tensor]]]:
-
-    try:
-        from xformers.ops import memory_efficient_attention
-    except:
-        raise RuntimeError('Please install xformers.')
-
-    check_valid_inputs(query, key, value)
-
-    if multiquery:
-        warnings.warn(
-            DeprecationWarning(
-                'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
-            ))
-        kv_n_heads = 1
-    elif kv_n_heads is None:
-        warnings.warn(
-            DeprecationWarning(
-                'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
-            ))
-        kv_n_heads = n_heads
-
-    if past_key_value is not None:
-        if len(past_key_value) != 0:
-            key = torch.cat([past_key_value[0], key], dim=1)
-            value = torch.cat([past_key_value[1], value], dim=1)
-
-        past_key_value = (key, value)
-
-    if attn_bias is not None:
-        # clamp to 0 necessary for torch 2.0 compile()
-        _s_q = max(0, attn_bias.size(2) - query.size(1))
-        _s_k = max(0, attn_bias.size(3) - key.size(1))
-        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
-
-    if dropout_p:
-        raise NotImplementedError(
-            f'Dropout not implemented for attn_impl: xformers.')
-    dropout_p = dropout_p if training else 0.0
-
-    if needs_weights:
-        raise NotImplementedError(
-            f'attn_impl: xformers cannot return attn weights.')
-
-    if key_padding_mask is not None:
-        warnings.warn(
-            'Propagating key_padding_mask to the attention module ' +\
-            'and applying it within the attention module can cause ' +\
-            'unnecessary computation/memory usage. Consider integrating ' +\
-            'into attn_bias once and passing that to each attention ' +\
-            'module instead.'
-        )
-        b_size, s_k = key_padding_mask.shape[:2]
-
-        if attn_bias is None:
-            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
-
-        attn_bias = attn_bias.masked_fill(
-            ~key_padding_mask.view((b_size, 1, 1, s_k)),
-            torch.finfo(query.dtype).min)
-
-    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
-    key = rearrange(key, 'b s (h d) -> b s h d', h=kv_n_heads)
-    value = rearrange(value, 'b s (h d) -> b s h d', h=kv_n_heads)
 
     # multi-query case
     if kv_n_heads == 1:
         # necessary to repeat instead of expand tensor because
         # output contains NaN in edge cases such as with head dimension = 8
         key = key.repeat(1, 1, n_heads, 1)
         value = value.repeat(1, 1, n_heads, 1)
     # grouped query case
     elif kv_n_heads < n_heads:
         # Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
         # We repeat each kv head by the group size number to use the underlying MHA kernels
         key = key.repeat_interleave(n_heads // kv_n_heads, dim=2)
         value = value.repeat_interleave(n_heads // kv_n_heads, dim=2)
 
@@ -516,13 +388,14 @@ def xformers_attn_fn(query: torch.Tensor,
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
-    attn_output = memory_efficient_attention(  # type: ignore
-        query, key, value, attn_bias, p=dropout_p)
+    attn_output = flash_attn_func(  # type: ignore
+        query, key, value, attn_bias, reset_is_causal, softmax_scale)
 
     output = attn_output.view(*attn_output.shape[:2], -1)  # type: ignore
 
     return output, None, past_key_value
 
+
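The warning in the removed `xformers_attn_fn` recommends folding `key_padding_mask` into `attn_bias` once, up front, instead of re-deriving it inside every attention call. That is just the `masked_fill` pattern from the removed code; a minimal standalone sketch with made-up shapes:

```python
import torch

# Hypothetical padding mask: batch of 2, kv length of 4; True = keep, False = pad.
key_padding_mask = torch.tensor([[True, True, True, False],
                                 [True, True, False, False]])
b_size, s_k = key_padding_mask.shape

# Additive bias, broadcastable against (batch, heads, q_len, k_len) scores:
# 0 where attention is allowed, dtype-min where the key position is padding.
attn_bias = torch.zeros(b_size, 1, 1, s_k)
attn_bias = attn_bias.masked_fill(~key_padding_mask.view(b_size, 1, 1, s_k),
                                  torch.finfo(attn_bias.dtype).min)
```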
 class GroupedQueryAttention(nn.Module):
     """Grouped Query Attention (GQA) is a generalization of Multi-head attention (MHA).
 
@@ -545,8 +418,8 @@ def __init__(
         attn_pdrop: float = 0.0,
         norm_type: str = 'low_precision_layernorm',
         fc_type: str = 'torch',
+        verbose: int = 0,
         device: Optional[str] = None,
-        bias: bool = True,
     ):
         super().__init__()
 
@@ -578,9 +451,7 @@ def __init__(
             self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
         self.attn_dropout_p = attn_pdrop
 
-        fc_kwargs: dict[str, Any] = {
-            'bias': bias,
-        }
+        fc_kwargs = {}
         if fc_type != 'te':
             fc_kwargs['device'] = device
         self.Wqkv = FC_CLASS_REGISTRY[fc_type](
@@ -593,7 +464,7 @@ def __init__(
             i * self.head_dim
             for i in range(1, self.n_heads + 2 * self.kv_n_heads)
         ]
-        self.Wqkv._fused = (0, fuse_splits)
+        self.Wqkv._fused = (0, fuse_splits)  # type: ignore
 
         if self.qk_ln:
             norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
@@ -605,8 +476,21 @@ def __init__(
             self.attn_fn = flash_attn_fn
         elif self.attn_impl == 'triton':
             self.attn_fn = triton_flash_attn_fn
+            if verbose:
+                warnings.warn(
+                    'While `attn_impl: triton` can be faster than `attn_impl: flash` ' +\
+                    'it uses more memory. When training larger models this can trigger ' +\
+                    'alloc retries which hurts performance. If encountered, we recommend ' +\
+                    'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
+                )
         elif self.attn_impl == 'torch':
             self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available() and verbose:
+                warnings.warn(
+                    'Using `attn_impl: torch`. If your model does not use `alibi` or ' +\
+                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' +\
+                    'we recommend using `attn_impl: triton`.'
+                )
         else:
             raise ValueError(f'{attn_impl=} is an invalid setting.')
 
@@ -615,7 +499,7 @@ def __init__(
             self.d_model,
             **fc_kwargs,
         )
-        self.out_proj._is_residual = True
+        self.out_proj._is_residual = True  # type: ignore
 
     def forward(
         self,
@@ -625,8 +509,7 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         is_causal: bool = True,
         needs_weights: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[
-            torch.Tensor, torch.Tensor]]]:
+    ):
         qkv = self.Wqkv(x)
 
         if self.clip_qkv:
@@ -686,8 +569,8 @@ def __init__(
         attn_pdrop: float = 0.0,
         norm_type: str = 'low_precision_layernorm',
         fc_type: str = 'torch',
+        verbose: int = 0,
         device: Optional[str] = None,
-        bias: bool = True,
     ):
         super().__init__(
             d_model=d_model,
@@ -700,9 +583,8 @@ def __init__(
             attn_pdrop=attn_pdrop,
             norm_type=norm_type,
             fc_type=fc_type,
-            device=device,
-            bias=bias,
-        )
+            verbose=verbose,
+            device=device)
 
 
 class MultiQueryAttention(GroupedQueryAttention):
 
@@ -723,8 +605,8 @@ def __init__(
         attn_pdrop: float = 0.0,
         norm_type: str = 'low_precision_layernorm',
         fc_type: str = 'torch',
+        verbose: int = 0,
         device: Optional[str] = None,
-        bias: bool = True,
     ):
         super().__init__(
             d_model=d_model,
@@ -737,15 +619,12 @@ def __init__(
             attn_pdrop=attn_pdrop,
             norm_type=norm_type,
             fc_type=fc_type,
-            device=device,
-            bias=bias,
-        )
+            verbose=verbose,
+            device=device)
 
 
-def attn_bias_shape(
-    attn_impl: str, n_heads: int, seq_len: int, alibi: bool,
-    prefix_lm: bool, causal: bool,
-    use_sequence_id: bool) -> Optional[Tuple[int, int, int, int]]:
+def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool,
+                    prefix_lm: bool, causal: bool, use_sequence_id: bool):
     if attn_impl == 'flash':
         return None
     elif attn_impl in ['torch', 'triton']:
@@ -768,7 +647,7 @@ def build_attn_bias(
     causal: bool = False,
     alibi: bool = False,
     alibi_bias_max: int = 8,
-) -> Optional[torch.Tensor]:
+):
     if attn_impl == 'flash':
         return None
     elif attn_impl in ['torch', 'triton']:
@@ -791,7 +670,7 @@ def gen_slopes(n_heads: int,
                alibi_bias_max: int = 8,
-               device: Optional[torch.device] = None) -> torch.Tensor:
+               device: Optional[torch.device] = None):
     _n_heads = 2**math.ceil(math.log2(n_heads))
     m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
     m = m.mul(alibi_bias_max / _n_heads)
@@ -813,7 +692,7 @@ def build_alibi_bias(
     alibi_bias_max: int = 8,
     device: Optional[torch.device] = None,
     dtype: Optional[torch.dtype] = None,
-) -> torch.Tensor:
+):
     alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32,
                               device=device).view(1, 1, 1, seq_len)
     if full:
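The `gen_slopes` hunk above shows the first half of the ALiBi slope computation. A self-contained sketch of the full recipe, assuming the standard `1 / 2**m` final step (the diff elides the function body past `m.mul(...)`, and the real helper handles non-power-of-two head counts differently; this sketch simply truncates):

```python
import math

import torch


def alibi_slopes(n_heads: int, alibi_bias_max: int = 8) -> torch.Tensor:
    # Round the head count up to a power of two, as gen_slopes does.
    _n_heads = 2**math.ceil(math.log2(n_heads))
    m = torch.arange(1, _n_heads + 1, dtype=torch.float32)
    m = m.mul(alibi_bias_max / _n_heads)
    # Assumed final step: the standard ALiBi geometric slope sequence.
    slopes = 1.0 / torch.pow(2, m)
    return slopes[:n_heads]


print(alibi_slopes(8))
# -> tensor([0.5000, 0.2500, 0.1250, 0.0625, 0.0312, 0.0156, 0.0078, 0.0039])
```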
diff --git a/scripts/train/benchmarking/README.md b/scripts/train/benchmarking/README.md
index 7164e93bd8..1cb8b5a045 100644
--- a/scripts/train/benchmarking/README.md
+++ b/scripts/train/benchmarking/README.md
@@ -69,6 +69,87 @@ Our microbatching engine enables microbatch sizes that do not divide Global Batch
 
 [comment]: # (TODO: Update tables with torch 2.0 after next Composer release)
 
+## H100 80GB
+| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 30b | 8192 | 8 | h100_80gb | 27.37 | 36.5 | 1 | 21 | 168 | 1 | 10128 | 1266 | 1376256 | amp_bf16 | PURE | FULL_SHARD | True | False | 30019254272 |
+| 
30b | 8192 | 8 | h100_80gb | 27.51 | 36.68 | 1 | 21 | 168 | 1 | 10179 | 1272 | 1376256 | amp_bf16 | PURE | FULL_SHARD | True | False | 30019254272 | +| 30b | 8192 | 8 | h100_80gb | 27.87 | 37.16 | 1 | 21 | 168 | 1 | 10311 | 1288 | 1376256 | amp_bf16 | PURE | FULL_SHARD | True | False | 30019254272 | +| 30b | 4096 | 8 | h100_80gb | 34.5 | 46.0 | 1 | 21 | 168 | 3 | 13873 | 1734 | 688128 | amp_bf16 | PURE | FULL_SHARD | True | False | 29989894144 | +| 30b | 4096 | 8 | h100_80gb | 34.61 | 46.15 | 1 | 21 | 168 | 3 | 13917 | 1739 | 688128 | amp_bf16 | PURE | FULL_SHARD | True | False | 29989894144 | +| 30b | 4096 | 8 | h100_80gb | 34.51 | 46.02 | 1 | 21 | 168 | 3 | 13878 | 1734 | 688128 | amp_bf16 | PURE | FULL_SHARD | True | False | 29989894144 | +| 30b | 2048 | 8 | h100_80gb | 37.35 | 49.8 | 3 | 21 | 504 | 7 | 15700 | 1962 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 8 | h100_80gb | 37.33 | 49.78 | 3 | 21 | 504 | 7 | 15693 | 1961 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 8 | h100_80gb | 37.54 | 50.05 | 3 | 21 | 504 | 7 | 15781 | 1972 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 1024 | 8 | h100_80gb | 38.21 | 50.94 | 6 | 21 | 1008 | 16 | 16433 | 2054 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29967874048 | +| 30b | 1024 | 8 | h100_80gb | 37.83 | 50.44 | 6 | 21 | 1008 | 15 | 16271 | 2033 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29967874048 | +| 30b | 1024 | 8 | h100_80gb | 38.07 | 50.76 | 6 | 21 | 1008 | 15 | 16376 | 2047 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29967874048 | +| 30b | 512 | 8 | h100_80gb | 38.64 | 51.52 | 12 | 21 | 2016 | 32 | 16816 | 2102 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29964204032 | +| 30b | 512 | 8 | h100_80gb | 38.38 | 51.17 | 12 | 21 | 2016 | 32 | 16702 | 2087 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29964204032 | +| 30b | 512 | 8 | h100_80gb | 38.47 | 51.29 | 12 | 21 | 2016 | 32 | 16741 | 2092 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29964204032 | +| 13b | 32768 | 8 | h100_80gb | 30.55 | 40.74 | 1 | 3 | 24 | 0 | 15250 | 1906 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 13011240960 | +| 13b | 32768 | 8 | h100_80gb | 30.73 | 40.97 | 1 | 3 | 24 | 0 | 15338 | 1917 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 13011240960 | +| 13b | 32768 | 8 | h100_80gb | 30.67 | 40.9 | 1 | 3 | 24 | 0 | 15309 | 1913 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 13011240960 | +| 13b | 8192 | 8 | h100_80gb | 37.02 | 49.36 | 5 | 3 | 120 | 3 | 30075 | 3759 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 8192 | 8 | h100_80gb | 36.97 | 49.29 | 5 | 3 | 120 | 3 | 30030 | 3753 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 8192 | 8 | h100_80gb | 37.15 | 49.53 | 5 | 3 | 120 | 3 | 30179 | 3772 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 2048 | 8 | h100_80gb | 41.03 | 54.71 | 20 | 3 | 480 | 19 | 39532 | 4941 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 8 | h100_80gb | 41.29 | 55.05 | 20 | 3 | 480 | 19 | 39779 | 4972 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 8 | h100_80gb | 40.97 | 54.63 | 20 | 3 | 480 | 19 | 39478 | 4934 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 512 | 8 | h100_80gb | 42.63 | 56.83 | 80 | 3 | 1920 | 84 | 
43074 | 5384 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 13b | 512 | 8 | h100_80gb | 42.51 | 56.68 | 80 | 3 | 1920 | 83 | 42954 | 5369 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 13b | 512 | 8 | h100_80gb | 42.24 | 56.32 | 80 | 3 | 1920 | 83 | 42684 | 5335 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 7b | 32768 | 8 | h100_80gb | 30.28 | 40.37 | 2 | 2 | 32 | 0 | 25983 | 3247 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6784688128 | +| 7b | 32768 | 8 | h100_80gb | 30.45 | 40.6 | 2 | 2 | 32 | 0 | 26127 | 3265 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6784688128 | +| 7b | 32768 | 8 | h100_80gb | 30.27 | 40.36 | 2 | 2 | 32 | 0 | 25973 | 3246 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6784688128 | +| 7b | 8192 | 8 | h100_80gb | 36.39 | 48.51 | 8 | 2 | 128 | 6 | 54355 | 6794 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 8192 | 8 | h100_80gb | 36.12 | 48.16 | 8 | 2 | 128 | 6 | 53957 | 6744 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 8192 | 8 | h100_80gb | 36.43 | 48.57 | 8 | 2 | 128 | 6 | 54419 | 6802 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 2048 | 8 | h100_80gb | 40.48 | 53.97 | 32 | 2 | 512 | 36 | 74217 | 9277 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 8 | h100_80gb | 40.43 | 53.91 | 32 | 2 | 512 | 36 | 74132 | 9266 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 8 | h100_80gb | 40.46 | 53.94 | 32 | 2 | 512 | 36 | 74180 | 9272 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 512 | 8 | h100_80gb | 42.02 | 56.02 | 128 | 2 | 2048 | 159 | 81676 | 10209 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 7b | 512 | 8 | h100_80gb | 42.02 | 56.02 | 128 | 2 | 2048 | 159 | 81679 | 10209 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 7b | 512 | 8 | h100_80gb | 41.88 | 55.84 | 128 | 2 | 2048 | 159 | 81417 | 10177 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 3b | 32768 | 8 | h100_80gb | 28.03 | 37.37 | 3 | 6 | 144 | 1 | 45654 | 5706 | 4718592 | amp_bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 32768 | 8 | h100_80gb | 28.0 | 37.33 | 3 | 6 | 144 | 1 | 45607 | 5700 | 4718592 | amp_bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 32768 | 8 | h100_80gb | 27.91 | 37.21 | 3 | 6 | 144 | 1 | 45465 | 5683 | 4718592 | amp_bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 32768 | 8 | h100_80gb | 14.38 | 19.18 | 3 | 6 | 144 | 1 | 46853 | 5856 | 4718592 | amp_fp8 | DEFAULT | FULL_SHARD | True | False | 2730480640 | +| 3b | 8192 | 8 | h100_80gb | 40.35 | 40.35 | 3 | 6 | 144 | 16 | 132753 | 16594 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 8192 | 8 | h100_80gb | 40.1 | 40.1 | 3 | 6 | 144 | 16 | 131934 | 16491 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 8192 | 8 | h100_80gb | 40.33 | 40.33 | 3 | 6 | 144 | 16 | 132697 | 16587 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 8192 | 8 | h100_80gb | 23.28 | 23.28 | 3 | 6 | 144 | 18 | 153174 | 19146 | 1179648 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2667566080 | +| 3b | 2048 | 8 | h100_80gb | 44.43 | 44.43 | 10 | 6 | 480 | 95 | 196229 | 24528 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 
2651837440 | +| 3b | 2048 | 8 | h100_80gb | 44.42 | 44.42 | 10 | 6 | 480 | 95 | 196171 | 24521 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 8 | h100_80gb | 44.2 | 44.2 | 10 | 6 | 480 | 95 | 195192 | 24399 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 8 | h100_80gb | 27.7 | 27.7 | 10 | 6 | 480 | 119 | 244692 | 30586 | 983040 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 512 | 8 | h100_80gb | 46.05 | 46.05 | 40 | 6 | 1920 | 434 | 222413 | 27801 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 3b | 512 | 8 | h100_80gb | 46.38 | 46.38 | 40 | 6 | 1920 | 437 | 223994 | 27999 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 3b | 512 | 8 | h100_80gb | 46.14 | 46.14 | 40 | 6 | 1920 | 435 | 222834 | 27854 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 3b | 512 | 8 | h100_80gb | 30.25 | 30.25 | 40 | 6 | 1920 | 570 | 292217 | 36527 | 983040 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2647905280 | +| 1b | 32768 | 8 | h100_80gb | 33.6 | 33.6 | 1 | 4 | 32 | 2 | 96354 | 12044 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1378865152 | +| 1b | 32768 | 8 | h100_80gb | 33.54 | 33.54 | 1 | 4 | 32 | 2 | 96182 | 12022 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1378865152 | +| 1b | 32768 | 8 | h100_80gb | 33.51 | 33.51 | 1 | 4 | 32 | 2 | 96105 | 12013 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1378865152 | +| 1b | 32768 | 8 | h100_80gb | 17.55 | 17.55 | 1 | 4 | 32 | 3 | 100643 | 12580 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1378865152 | +| 1b | 8192 | 8 | h100_80gb | 36.66 | 36.66 | 2 | 4 | 64 | 27 | 226682 | 28335 | 524288 | amp_bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 8192 | 8 | h100_80gb | 36.74 | 36.74 | 2 | 4 | 64 | 27 | 227183 | 28397 | 524288 | amp_bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 8192 | 8 | h100_80gb | 36.39 | 36.39 | 2 | 4 | 64 | 27 | 225010 | 28126 | 524288 | amp_bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 8192 | 8 | h100_80gb | 20.71 | 20.71 | 2 | 4 | 64 | 31 | 256087 | 32010 | 524288 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1328533504 | +| 1b | 512 | 8 | h100_80gb | 29.06 | 29.06 | 56 | 4 | 1792 | 1098 | 562523 | 70315 | 917504 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1312804864 | +| 350m | 32768 | 8 | h100_80gb | 28.95 | 28.95 | 1 | 4 | 32 | 5 | 191165 | 23895 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 32768 | 8 | h100_80gb | 28.88 | 28.88 | 1 | 4 | 32 | 5 | 190718 | 23839 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 32768 | 8 | h100_80gb | 28.98 | 28.98 | 1 | 4 | 32 | 5 | 191350 | 23918 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 32768 | 8 | h100_80gb | 14.8 | 14.8 | 1 | 4 | 32 | 5 | 195516 | 24439 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 387442688 | +| 350m | 16384 | 8 | h100_80gb | 29.9 | 29.9 | 2 | 4 | 64 | 20 | 335478 | 41934 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 16384 | 8 | h100_80gb | 29.76 | 29.76 | 2 | 4 | 64 | 20 | 333921 | 41740 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 16384 | 8 | h100_80gb | 29.95 | 29.95 | 2 | 4 | 64 | 20 | 336016 | 42002 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 16384 | 8 | 
h100_80gb | 15.31 | 15.31 | 2 | 4 | 64 | 20 | 343435 | 42929 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 370665472 | +| 350m | 2048 | 8 | h100_80gb | 6.05 | 8.06 | 3 | 21 | 504 | 170 | 349409 | 43676 | 1032192 | amp_fp8 | DEFAULT | FULL_SHARD | True | False | 355985408 | +| 350m | 512 | 8 | h100_80gb | 32.32 | 32.32 | 56 | 4 | 1792 | 2194 | 1123449 | 140431 | 917504 | amp_bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 350m | 512 | 8 | h100_80gb | 32.79 | 32.79 | 56 | 4 | 1792 | 2226 | 1139870 | 142483 | 917504 | amp_bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 350m | 512 | 8 | h100_80gb | 32.77 | 32.77 | 56 | 4 | 1792 | 2224 | 1138963 | 142370 | 917504 | amp_bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 350m | 512 | 8 | h100_80gb | 17.77 | 17.77 | 56 | 4 | 1792 | 2412 | 1235360 | 154420 | 917504 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 354412544 | + ## A100 80GB with 1600 Gbps node-node interconnect (RoCE) | Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | diff --git a/scripts/train/benchmarking/Torchcompiledebugging.numbers b/scripts/train/benchmarking/Torchcompiledebugging.numbers deleted file mode 100755 index b1a0787f02..0000000000 Binary files a/scripts/train/benchmarking/Torchcompiledebugging.numbers and /dev/null differ diff --git a/scripts/train/benchmarking/benchmark_results.csv b/scripts/train/benchmarking/benchmark_results.csv index d7e8396573..a39035d320 100644 --- a/scripts/train/benchmarking/benchmark_results.csv +++ b/scripts/train/benchmarking/benchmark_results.csv @@ -1,21 +1,78 @@ Model,SeqLen (T),# GPUs,GPU,MFU,HFU,MicroBatchSize,GradAccum,GlobalBatchSize,Throughput (S/s),Throughput (T/s),Throughput (T/s/GPU),GlobalBatchSize (T),Precision,MP Mode,Sharding Strategy,Activation Checkpointing,Activation CPUOffload,NumParams -7b,2048,8,a100_40gb,50.42,67.23,12,4,384,14,29150,3643,786432,amp_bf16,PURE,FULL_SHARD,True,False,6658859008 -3b,2048,8,a100_40gb,57.21,57.21,5,8,320,38,79667,9958,655360,amp_bf16,PURE,FULL_SHARD,False,False,2651837440 -1b,2048,8,a100_40gb,56.21,56.21,8,8,512,75,154114,19264,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1315950592 -1b,2048,8,a100_40gb,55.86,55.86,8,8,512,74,153151,19143,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1315950592 -1b,2048,8,a100_40gb,45.91,61.21,8,8,512,61,125871,15733,1048576,amp_bf16,PURE,FULL_SHARD,True,False,1315950592 -760m,2048,8,a100_40gb,51.5,51.5,12,4,384,114,235052,29381,786432,amp_bf16,PURE,FULL_SHARD,False,False,760470528 -760m,2048,8,a100_40gb,51.96,51.96,12,4,384,115,237143,29642,786432,amp_bf16,PURE,FULL_SHARD,False,False,760470528 -760m,2048,8,a100_40gb,42.72,56.96,12,4,384,95,194992,24374,786432,amp_bf16,PURE,FULL_SHARD,True,False,760470528 -350m,2048,8,a100_40gb,36.26,36.26,16,4,512,161,330354,41294,1048576,amp_bf16,PURE,FULL_SHARD,False,False,355985408 -350m,2048,8,a100_40gb,39.45,39.45,16,4,512,175,359403,44925,1048576,amp_bf16,PURE,FULL_SHARD,False,False,355985408 -350m,2048,8,a100_40gb,39.31,52.42,16,4,512,174,358147,44768,1048576,amp_bf16,PURE,FULL_SHARD,True,False,355985408 -125m,2048,8,a100_40gb,35.6,47.47,26,3,624,443,908206,113525,1277952,amp_bf16,PURE,FULL_SHARD,True,False,125311488 -125m,2048,8,a100_40gb,42.12,42.12,24,3,576,524,1074518,134314,1179648,amp_bf16,PURE,FULL_SHARD,False,False,125311488 
-125m,2048,8,a100_40gb,42.21,42.21,24,3,576,525,1076846,134605,1179648,amp_bf16,PURE,FULL_SHARD,False,False,125311488 -125m,2048,8,a100_40gb,40.8,40.8,24,3,576,508,1040826,130103,1179648,amp_bf16,PURE,FULL_SHARD,False,False,125311488 -125m,2048,8,a100_40gb,32.74,32.74,24,3,576,407,835286,104410,1179648,amp_bf16,PURE,FULL_SHARD,False,False,125311488 -125m,2048,8,a100_40gb,35.46,47.28,24,3,576,441,904742,113092,1179648,amp_bf16,PURE,FULL_SHARD,True,False,125311488 -125m,2048,8,a100_40gb,41.89,41.89,24,3,576,521,1068638,133579,1179648,amp_bf16,PURE,FULL_SHARD,False,False,125311488 -125m,2048,8,a100_40gb,11.95,11.95,12,3,288,148,304881,38110,589824,amp_bf16,PURE,FULL_SHARD,False,False,125311488 -125m,2048,8,a100_40gb,10.74,10.74,12,3,288,133,274018,34252,589824,amp_bf16,PURE,FULL_SHARD,False,False,125311488 +30b,8192,8,h100_80gb,27.37,36.5,1,21,168,1,10128,1266,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272 +30b,8192,8,h100_80gb,27.51,36.68,1,21,168,1,10179,1272,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272 +30b,8192,8,h100_80gb,27.87,37.16,1,21,168,1,10311,1288,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272 +30b,4096,8,h100_80gb,34.5,46.0,1,21,168,3,13873,1734,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144 +30b,4096,8,h100_80gb,34.61,46.15,1,21,168,3,13917,1739,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144 +30b,4096,8,h100_80gb,34.51,46.02,1,21,168,3,13878,1734,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144 +30b,2048,8,h100_80gb,37.35,49.8,3,21,504,7,15700,1962,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080 +30b,2048,8,h100_80gb,37.33,49.78,3,21,504,7,15693,1961,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080 +30b,2048,8,h100_80gb,37.54,50.05,3,21,504,7,15781,1972,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080 +30b,1024,8,h100_80gb,38.21,50.94,6,21,1008,16,16433,2054,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048 +30b,1024,8,h100_80gb,37.83,50.44,6,21,1008,15,16271,2033,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048 +30b,1024,8,h100_80gb,38.07,50.76,6,21,1008,15,16376,2047,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048 +30b,512,8,h100_80gb,38.64,51.52,12,21,2016,32,16816,2102,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032 +30b,512,8,h100_80gb,38.38,51.17,12,21,2016,32,16702,2087,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032 +30b,512,8,h100_80gb,38.47,51.29,12,21,2016,32,16741,2092,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032 +13b,32768,8,h100_80gb,30.55,40.74,1,3,24,0,15250,1906,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960 +13b,32768,8,h100_80gb,30.73,40.97,1,3,24,0,15338,1917,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960 +13b,32768,8,h100_80gb,30.67,40.9,1,3,24,0,15309,1913,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960 +13b,8192,8,h100_80gb,37.02,49.36,5,3,120,3,30075,3759,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840 +13b,8192,8,h100_80gb,36.97,49.29,5,3,120,3,30030,3753,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840 +13b,8192,8,h100_80gb,37.15,49.53,5,3,120,3,30179,3772,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840 +13b,2048,8,h100_80gb,41.03,54.71,20,3,480,19,39532,4941,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560 +13b,2048,8,h100_80gb,41.29,55.05,20,3,480,19,39779,4972,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560 +13b,2048,8,h100_80gb,40.97,54.63,20,3,480,19,39478,4934,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560 
+13b,512,8,h100_80gb,42.63,56.83,80,3,1920,84,43074,5384,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240 +13b,512,8,h100_80gb,42.51,56.68,80,3,1920,83,42954,5369,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240 +13b,512,8,h100_80gb,42.24,56.32,80,3,1920,83,42684,5335,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240 +7b,32768,8,h100_80gb,30.28,40.37,2,2,32,0,25983,3247,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128 +7b,32768,8,h100_80gb,30.45,40.6,2,2,32,0,26127,3265,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128 +7b,32768,8,h100_80gb,30.27,40.36,2,2,32,0,25973,3246,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128 +7b,8192,8,h100_80gb,36.39,48.51,8,2,128,6,54355,6794,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832 +7b,8192,8,h100_80gb,36.12,48.16,8,2,128,6,53957,6744,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832 +7b,8192,8,h100_80gb,36.43,48.57,8,2,128,6,54419,6802,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832 +7b,2048,8,h100_80gb,40.48,53.97,32,2,512,36,74217,9277,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008 +7b,2048,8,h100_80gb,40.43,53.91,32,2,512,36,74132,9266,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008 +7b,2048,8,h100_80gb,40.46,53.94,32,2,512,36,74180,9272,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008 +7b,512,8,h100_80gb,42.02,56.02,128,2,2048,159,81676,10209,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552 +7b,512,8,h100_80gb,42.02,56.02,128,2,2048,159,81679,10209,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552 +7b,512,8,h100_80gb,41.88,55.84,128,2,2048,159,81417,10177,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552 +3b,32768,8,h100_80gb,28.03,37.37,3,6,144,1,45654,5706,4718592,amp_bf16,PURE,FULL_SHARD,True,False,2730480640 +3b,32768,8,h100_80gb,28.0,37.33,3,6,144,1,45607,5700,4718592,amp_bf16,PURE,FULL_SHARD,True,False,2730480640 +3b,32768,8,h100_80gb,27.91,37.21,3,6,144,1,45465,5683,4718592,amp_bf16,PURE,FULL_SHARD,True,False,2730480640 +3b,32768,8,h100_80gb,14.38,19.18,3,6,144,1,46853,5856,4718592,amp_fp8,DEFAULT,FULL_SHARD,True,False,2730480640 +3b,8192,8,h100_80gb,40.35,40.35,3,6,144,16,132753,16594,1179648,amp_bf16,PURE,FULL_SHARD,False,False,2667566080 +3b,8192,8,h100_80gb,40.1,40.1,3,6,144,16,131934,16491,1179648,amp_bf16,PURE,FULL_SHARD,False,False,2667566080 +3b,8192,8,h100_80gb,40.33,40.33,3,6,144,16,132697,16587,1179648,amp_bf16,PURE,FULL_SHARD,False,False,2667566080 +3b,8192,8,h100_80gb,23.28,23.28,3,6,144,18,153174,19146,1179648,amp_fp8,DEFAULT,FULL_SHARD,False,False,2667566080 +3b,2048,8,h100_80gb,44.43,44.43,10,6,480,95,196229,24528,983040,amp_bf16,PURE,FULL_SHARD,False,False,2651837440 +3b,2048,8,h100_80gb,44.42,44.42,10,6,480,95,196171,24521,983040,amp_bf16,PURE,FULL_SHARD,False,False,2651837440 +3b,2048,8,h100_80gb,44.2,44.2,10,6,480,95,195192,24399,983040,amp_bf16,PURE,FULL_SHARD,False,False,2651837440 +3b,2048,8,h100_80gb,27.7,27.7,10,6,480,119,244692,30586,983040,amp_fp8,DEFAULT,FULL_SHARD,False,False,2651837440 +3b,512,8,h100_80gb,46.05,46.05,40,6,1920,434,222413,27801,983040,amp_bf16,PURE,FULL_SHARD,False,False,2647905280 +3b,512,8,h100_80gb,46.38,46.38,40,6,1920,437,223994,27999,983040,amp_bf16,PURE,FULL_SHARD,False,False,2647905280 +3b,512,8,h100_80gb,46.14,46.14,40,6,1920,435,222834,27854,983040,amp_bf16,PURE,FULL_SHARD,False,False,2647905280 +3b,512,8,h100_80gb,30.25,30.25,40,6,1920,570,292217,36527,983040,amp_fp8,DEFAULT,FULL_SHARD,False,False,2647905280 
+1b,32768,8,h100_80gb,33.6,33.6,1,4,32,2,96354,12044,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1378865152 +1b,32768,8,h100_80gb,33.54,33.54,1,4,32,2,96182,12022,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1378865152 +1b,32768,8,h100_80gb,33.51,33.51,1,4,32,2,96105,12013,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1378865152 +1b,32768,8,h100_80gb,17.55,17.55,1,4,32,3,100643,12580,1048576,amp_fp8,DEFAULT,FULL_SHARD,False,False,1378865152 +1b,8192,8,h100_80gb,36.66,36.66,2,4,64,27,226682,28335,524288,amp_bf16,PURE,FULL_SHARD,False,False,1328533504 +1b,8192,8,h100_80gb,36.74,36.74,2,4,64,27,227183,28397,524288,amp_bf16,PURE,FULL_SHARD,False,False,1328533504 +1b,8192,8,h100_80gb,36.39,36.39,2,4,64,27,225010,28126,524288,amp_bf16,PURE,FULL_SHARD,False,False,1328533504 +1b,8192,8,h100_80gb,20.71,20.71,2,4,64,31,256087,32010,524288,amp_fp8,DEFAULT,FULL_SHARD,False,False,1328533504 +1b,512,8,h100_80gb,29.06,29.06,56,4,1792,1098,562523,70315,917504,amp_fp8,DEFAULT,FULL_SHARD,False,False,1312804864 +350m,32768,8,h100_80gb,28.95,28.95,1,4,32,5,191165,23895,1048576,amp_bf16,PURE,FULL_SHARD,False,False,387442688 +350m,32768,8,h100_80gb,28.88,28.88,1,4,32,5,190718,23839,1048576,amp_bf16,PURE,FULL_SHARD,False,False,387442688 +350m,32768,8,h100_80gb,28.98,28.98,1,4,32,5,191350,23918,1048576,amp_bf16,PURE,FULL_SHARD,False,False,387442688 +350m,32768,8,h100_80gb,14.8,14.8,1,4,32,5,195516,24439,1048576,amp_fp8,DEFAULT,FULL_SHARD,False,False,387442688 +350m,16384,8,h100_80gb,29.9,29.9,2,4,64,20,335478,41934,1048576,amp_bf16,PURE,FULL_SHARD,False,False,370665472 +350m,16384,8,h100_80gb,29.76,29.76,2,4,64,20,333921,41740,1048576,amp_bf16,PURE,FULL_SHARD,False,False,370665472 +350m,16384,8,h100_80gb,29.95,29.95,2,4,64,20,336016,42002,1048576,amp_bf16,PURE,FULL_SHARD,False,False,370665472 +350m,16384,8,h100_80gb,15.31,15.31,2,4,64,20,343435,42929,1048576,amp_fp8,DEFAULT,FULL_SHARD,False,False,370665472 +350m,2048,8,h100_80gb,6.05,8.06,3,21,504,170,349409,43676,1032192,amp_fp8,DEFAULT,FULL_SHARD,True,False,355985408 +350m,512,8,h100_80gb,32.32,32.32,56,4,1792,2194,1123449,140431,917504,amp_bf16,PURE,FULL_SHARD,False,False,354412544 +350m,512,8,h100_80gb,32.79,32.79,56,4,1792,2226,1139870,142483,917504,amp_bf16,PURE,FULL_SHARD,False,False,354412544 +350m,512,8,h100_80gb,32.77,32.77,56,4,1792,2224,1138963,142370,917504,amp_bf16,PURE,FULL_SHARD,False,False,354412544 +350m,512,8,h100_80gb,17.77,17.77,56,4,1792,2412,1235360,154420,917504,amp_fp8,DEFAULT,FULL_SHARD,False,False,354412544 diff --git a/scripts/train/benchmarking/benchmark_results.md b/scripts/train/benchmarking/benchmark_results.md index e16547bee3..2a320d0b2d 100644 --- a/scripts/train/benchmarking/benchmark_results.md +++ b/scripts/train/benchmarking/benchmark_results.md @@ -1,22 +1,79 @@ | Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 7b | 2048 | 8 | a100_40gb | 50.42 | 67.23 | 12 | 4 | 384 | 14 | 29150 | 3643 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 3b | 2048 | 8 | a100_40gb | 57.21 | 57.21 | 5 | 8 | 320 | 38 | 79667 | 9958 | 655360 | amp_bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 1b | 2048 | 8 | a100_40gb | 56.21 | 56.21 | 8 | 8 | 512 | 75 | 154114 | 
19264 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 8 | a100_40gb | 55.86 | 55.86 | 8 | 8 | 512 | 74 | 153151 | 19143 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 8 | a100_40gb | 45.91 | 61.21 | 8 | 8 | 512 | 61 | 125871 | 15733 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 1315950592 | -| 760m | 2048 | 8 | a100_40gb | 51.5 | 51.5 | 12 | 4 | 384 | 114 | 235052 | 29381 | 786432 | amp_bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 8 | a100_40gb | 51.96 | 51.96 | 12 | 4 | 384 | 115 | 237143 | 29642 | 786432 | amp_bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 8 | a100_40gb | 42.72 | 56.96 | 12 | 4 | 384 | 95 | 194992 | 24374 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 760470528 | -| 350m | 2048 | 8 | a100_40gb | 36.26 | 36.26 | 16 | 4 | 512 | 161 | 330354 | 41294 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 8 | a100_40gb | 39.45 | 39.45 | 16 | 4 | 512 | 175 | 359403 | 44925 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 8 | a100_40gb | 39.31 | 52.42 | 16 | 4 | 512 | 174 | 358147 | 44768 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 355985408 | -| 125m | 2048 | 8 | a100_40gb | 35.6 | 47.47 | 26 | 3 | 624 | 443 | 908206 | 113525 | 1277952 | amp_bf16 | PURE | FULL_SHARD | True | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 42.12 | 42.12 | 24 | 3 | 576 | 524 | 1074518 | 134314 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 42.21 | 42.21 | 24 | 3 | 576 | 525 | 1076846 | 134605 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 40.8 | 40.8 | 24 | 3 | 576 | 508 | 1040826 | 130103 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 32.74 | 32.74 | 24 | 3 | 576 | 407 | 835286 | 104410 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 35.46 | 47.28 | 24 | 3 | 576 | 441 | 904742 | 113092 | 1179648 | amp_bf16 | PURE | FULL_SHARD | True | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 41.89 | 41.89 | 24 | 3 | 576 | 521 | 1068638 | 133579 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 11.95 | 11.95 | 12 | 3 | 288 | 148 | 304881 | 38110 | 589824 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 10.74 | 10.74 | 12 | 3 | 288 | 133 | 274018 | 34252 | 589824 | amp_bf16 | PURE | FULL_SHARD | False | False | 125311488 | +| 30b | 8192 | 8 | h100_80gb | 27.37 | 36.5 | 1 | 21 | 168 | 1 | 10128 | 1266 | 1376256 | amp_bf16 | PURE | FULL_SHARD | True | False | 30019254272 | +| 30b | 8192 | 8 | h100_80gb | 27.51 | 36.68 | 1 | 21 | 168 | 1 | 10179 | 1272 | 1376256 | amp_bf16 | PURE | FULL_SHARD | True | False | 30019254272 | +| 30b | 8192 | 8 | h100_80gb | 27.87 | 37.16 | 1 | 21 | 168 | 1 | 10311 | 1288 | 1376256 | amp_bf16 | PURE | FULL_SHARD | True | False | 30019254272 | +| 30b | 4096 | 8 | h100_80gb | 34.5 | 46.0 | 1 | 21 | 168 | 3 | 13873 | 1734 | 688128 | amp_bf16 | PURE | FULL_SHARD | True | False | 29989894144 | +| 30b | 4096 | 8 | h100_80gb | 34.61 | 46.15 | 1 | 21 | 168 | 3 | 13917 | 1739 | 688128 | amp_bf16 | PURE | FULL_SHARD | True | False | 29989894144 | +| 30b | 4096 | 8 | h100_80gb | 34.51 | 46.02 | 1 | 21 | 168 | 3 | 13878 | 1734 
| 688128 | amp_bf16 | PURE | FULL_SHARD | True | False | 29989894144 | +| 30b | 2048 | 8 | h100_80gb | 37.35 | 49.8 | 3 | 21 | 504 | 7 | 15700 | 1962 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 8 | h100_80gb | 37.33 | 49.78 | 3 | 21 | 504 | 7 | 15693 | 1961 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 8 | h100_80gb | 37.54 | 50.05 | 3 | 21 | 504 | 7 | 15781 | 1972 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 1024 | 8 | h100_80gb | 38.21 | 50.94 | 6 | 21 | 1008 | 16 | 16433 | 2054 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29967874048 | +| 30b | 1024 | 8 | h100_80gb | 37.83 | 50.44 | 6 | 21 | 1008 | 15 | 16271 | 2033 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29967874048 | +| 30b | 1024 | 8 | h100_80gb | 38.07 | 50.76 | 6 | 21 | 1008 | 15 | 16376 | 2047 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29967874048 | +| 30b | 512 | 8 | h100_80gb | 38.64 | 51.52 | 12 | 21 | 2016 | 32 | 16816 | 2102 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29964204032 | +| 30b | 512 | 8 | h100_80gb | 38.38 | 51.17 | 12 | 21 | 2016 | 32 | 16702 | 2087 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29964204032 | +| 30b | 512 | 8 | h100_80gb | 38.47 | 51.29 | 12 | 21 | 2016 | 32 | 16741 | 2092 | 1032192 | amp_bf16 | PURE | FULL_SHARD | True | False | 29964204032 | +| 13b | 32768 | 8 | h100_80gb | 30.55 | 40.74 | 1 | 3 | 24 | 0 | 15250 | 1906 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 13011240960 | +| 13b | 32768 | 8 | h100_80gb | 30.73 | 40.97 | 1 | 3 | 24 | 0 | 15338 | 1917 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 13011240960 | +| 13b | 32768 | 8 | h100_80gb | 30.67 | 40.9 | 1 | 3 | 24 | 0 | 15309 | 1913 | 786432 | amp_bf16 | PURE | FULL_SHARD | True | False | 13011240960 | +| 13b | 8192 | 8 | h100_80gb | 37.02 | 49.36 | 5 | 3 | 120 | 3 | 30075 | 3759 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 8192 | 8 | h100_80gb | 36.97 | 49.29 | 5 | 3 | 120 | 3 | 30030 | 3753 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 8192 | 8 | h100_80gb | 37.15 | 49.53 | 5 | 3 | 120 | 3 | 30179 | 3772 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 2048 | 8 | h100_80gb | 41.03 | 54.71 | 20 | 3 | 480 | 19 | 39532 | 4941 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 8 | h100_80gb | 41.29 | 55.05 | 20 | 3 | 480 | 19 | 39779 | 4972 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 8 | h100_80gb | 40.97 | 54.63 | 20 | 3 | 480 | 19 | 39478 | 4934 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 512 | 8 | h100_80gb | 42.63 | 56.83 | 80 | 3 | 1920 | 84 | 43074 | 5384 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 13b | 512 | 8 | h100_80gb | 42.51 | 56.68 | 80 | 3 | 1920 | 83 | 42954 | 5369 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 13b | 512 | 8 | h100_80gb | 42.24 | 56.32 | 80 | 3 | 1920 | 83 | 42684 | 5335 | 983040 | amp_bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 7b | 32768 | 8 | h100_80gb | 30.28 | 40.37 | 2 | 2 | 32 | 0 | 25983 | 3247 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6784688128 | +| 7b | 32768 | 8 | h100_80gb | 30.45 | 40.6 | 2 | 2 | 32 | 0 | 26127 | 3265 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 
6784688128 | +| 7b | 32768 | 8 | h100_80gb | 30.27 | 40.36 | 2 | 2 | 32 | 0 | 25973 | 3246 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6784688128 | +| 7b | 8192 | 8 | h100_80gb | 36.39 | 48.51 | 8 | 2 | 128 | 6 | 54355 | 6794 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 8192 | 8 | h100_80gb | 36.12 | 48.16 | 8 | 2 | 128 | 6 | 53957 | 6744 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 8192 | 8 | h100_80gb | 36.43 | 48.57 | 8 | 2 | 128 | 6 | 54419 | 6802 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 2048 | 8 | h100_80gb | 40.48 | 53.97 | 32 | 2 | 512 | 36 | 74217 | 9277 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 8 | h100_80gb | 40.43 | 53.91 | 32 | 2 | 512 | 36 | 74132 | 9266 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 8 | h100_80gb | 40.46 | 53.94 | 32 | 2 | 512 | 36 | 74180 | 9272 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 512 | 8 | h100_80gb | 42.02 | 56.02 | 128 | 2 | 2048 | 159 | 81676 | 10209 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 7b | 512 | 8 | h100_80gb | 42.02 | 56.02 | 128 | 2 | 2048 | 159 | 81679 | 10209 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 7b | 512 | 8 | h100_80gb | 41.88 | 55.84 | 128 | 2 | 2048 | 159 | 81417 | 10177 | 1048576 | amp_bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 3b | 32768 | 8 | h100_80gb | 28.03 | 37.37 | 3 | 6 | 144 | 1 | 45654 | 5706 | 4718592 | amp_bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 32768 | 8 | h100_80gb | 28.0 | 37.33 | 3 | 6 | 144 | 1 | 45607 | 5700 | 4718592 | amp_bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 32768 | 8 | h100_80gb | 27.91 | 37.21 | 3 | 6 | 144 | 1 | 45465 | 5683 | 4718592 | amp_bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 32768 | 8 | h100_80gb | 14.38 | 19.18 | 3 | 6 | 144 | 1 | 46853 | 5856 | 4718592 | amp_fp8 | DEFAULT | FULL_SHARD | True | False | 2730480640 | +| 3b | 8192 | 8 | h100_80gb | 40.35 | 40.35 | 3 | 6 | 144 | 16 | 132753 | 16594 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 8192 | 8 | h100_80gb | 40.1 | 40.1 | 3 | 6 | 144 | 16 | 131934 | 16491 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 8192 | 8 | h100_80gb | 40.33 | 40.33 | 3 | 6 | 144 | 16 | 132697 | 16587 | 1179648 | amp_bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 8192 | 8 | h100_80gb | 23.28 | 23.28 | 3 | 6 | 144 | 18 | 153174 | 19146 | 1179648 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2667566080 | +| 3b | 2048 | 8 | h100_80gb | 44.43 | 44.43 | 10 | 6 | 480 | 95 | 196229 | 24528 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 8 | h100_80gb | 44.42 | 44.42 | 10 | 6 | 480 | 95 | 196171 | 24521 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 8 | h100_80gb | 44.2 | 44.2 | 10 | 6 | 480 | 95 | 195192 | 24399 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 8 | h100_80gb | 27.7 | 27.7 | 10 | 6 | 480 | 119 | 244692 | 30586 | 983040 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 512 | 8 | h100_80gb | 46.05 | 46.05 | 40 | 6 | 1920 | 434 | 222413 | 27801 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 3b | 512 | 8 | h100_80gb | 46.38 | 46.38 | 40 | 6 
| 1920 | 437 | 223994 | 27999 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 3b | 512 | 8 | h100_80gb | 46.14 | 46.14 | 40 | 6 | 1920 | 435 | 222834 | 27854 | 983040 | amp_bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 3b | 512 | 8 | h100_80gb | 30.25 | 30.25 | 40 | 6 | 1920 | 570 | 292217 | 36527 | 983040 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2647905280 | +| 1b | 32768 | 8 | h100_80gb | 33.6 | 33.6 | 1 | 4 | 32 | 2 | 96354 | 12044 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1378865152 | +| 1b | 32768 | 8 | h100_80gb | 33.54 | 33.54 | 1 | 4 | 32 | 2 | 96182 | 12022 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1378865152 | +| 1b | 32768 | 8 | h100_80gb | 33.51 | 33.51 | 1 | 4 | 32 | 2 | 96105 | 12013 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 1378865152 | +| 1b | 32768 | 8 | h100_80gb | 17.55 | 17.55 | 1 | 4 | 32 | 3 | 100643 | 12580 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1378865152 | +| 1b | 8192 | 8 | h100_80gb | 36.66 | 36.66 | 2 | 4 | 64 | 27 | 226682 | 28335 | 524288 | amp_bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 8192 | 8 | h100_80gb | 36.74 | 36.74 | 2 | 4 | 64 | 27 | 227183 | 28397 | 524288 | amp_bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 8192 | 8 | h100_80gb | 36.39 | 36.39 | 2 | 4 | 64 | 27 | 225010 | 28126 | 524288 | amp_bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 8192 | 8 | h100_80gb | 20.71 | 20.71 | 2 | 4 | 64 | 31 | 256087 | 32010 | 524288 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1328533504 | +| 1b | 512 | 8 | h100_80gb | 29.06 | 29.06 | 56 | 4 | 1792 | 1098 | 562523 | 70315 | 917504 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1312804864 | +| 350m | 32768 | 8 | h100_80gb | 28.95 | 28.95 | 1 | 4 | 32 | 5 | 191165 | 23895 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 32768 | 8 | h100_80gb | 28.88 | 28.88 | 1 | 4 | 32 | 5 | 190718 | 23839 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 32768 | 8 | h100_80gb | 28.98 | 28.98 | 1 | 4 | 32 | 5 | 191350 | 23918 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 32768 | 8 | h100_80gb | 14.8 | 14.8 | 1 | 4 | 32 | 5 | 195516 | 24439 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 387442688 | +| 350m | 16384 | 8 | h100_80gb | 29.9 | 29.9 | 2 | 4 | 64 | 20 | 335478 | 41934 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 16384 | 8 | h100_80gb | 29.76 | 29.76 | 2 | 4 | 64 | 20 | 333921 | 41740 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 16384 | 8 | h100_80gb | 29.95 | 29.95 | 2 | 4 | 64 | 20 | 336016 | 42002 | 1048576 | amp_bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 16384 | 8 | h100_80gb | 15.31 | 15.31 | 2 | 4 | 64 | 20 | 343435 | 42929 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 370665472 | +| 350m | 2048 | 8 | h100_80gb | 6.05 | 8.06 | 3 | 21 | 504 | 170 | 349409 | 43676 | 1032192 | amp_fp8 | DEFAULT | FULL_SHARD | True | False | 355985408 | +| 350m | 512 | 8 | h100_80gb | 32.32 | 32.32 | 56 | 4 | 1792 | 2194 | 1123449 | 140431 | 917504 | amp_bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 350m | 512 | 8 | h100_80gb | 32.79 | 32.79 | 56 | 4 | 1792 | 2226 | 1139870 | 142483 | 917504 | amp_bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 350m | 512 | 8 | h100_80gb | 32.77 | 32.77 | 56 | 4 | 1792 | 2224 | 
1138963 | 142370 | 917504 | amp_bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 350m | 512 | 8 | h100_80gb | 17.77 | 17.77 | 56 | 4 | 1792 | 2412 | 1235360 | 154420 | 917504 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 354412544 | diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py index c9a089e6e4..64ca38b920 100644 --- a/scripts/train/benchmarking/collect_results.py +++ b/scripts/train/benchmarking/collect_results.py @@ -57,7 +57,7 @@ def sort_key(r: msdk.Run): print(model_name) raise ValueError model_size = int(model_name[:-1]) - return (r.image, model_name_size, model_size, r.submitted_config.parameters['max_seq_len'], + return (r.gpu_type, r.image, model_name_size, model_size, r.submitted_config.parameters['max_seq_len'], num_gpu, r.submitted_config.parameters['global_train_batch_size']) runs.sort(reverse=True, key=sort_key) @@ -93,7 +93,7 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: model_name = run.name.split('-')[2] gpus = run.gpus gpu_type = run.gpu_type - GPU_AVAILABLE_FLOPS = 312_000_000_000_000 if (gpu_type != "h100_80gb") else (1_979_000_000_000_000/2 if run.submitted_config.parameters['precision'] == 'bf16' else 1_979_000_000_000_000)# NOTE: This is accurate for BF16 or FP8 only + GPU_AVAILABLE_FLOPS = 312_000_000_000_000 if (gpu_type != "h100_80gb") else (1_979_000_000_000_000/2 if run.submitted_config.parameters['precision'] == 'amp_bf16' else 1_979_000_000_000_000)# NOTE: This is accurate for BF16 or FP8 only fsdp_config = run.submitted_config.parameters['fsdp_config'] seq_len = run.submitted_config.parameters['max_seq_len'] @@ -190,8 +190,6 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: str(fsdp_config['activation_cpu_offload']), 'NumParams': n_params, - 'Image': - image, # 'Compile Mode': # compile_mode, # 'Compile Fullgraph': diff --git a/scripts/train/benchmarking/h100.csv b/scripts/train/benchmarking/h100.csv deleted file mode 100644 index c4d115cee6..0000000000 --- a/scripts/train/benchmarking/h100.csv +++ /dev/null @@ -1,78 +0,0 @@ -Model,SeqLen (T),# GPUs,GPU,MFU,HFU,MicroBatchSize,GradAccum,GlobalBatchSize,Throughput (S/s),Throughput (T/s),Throughput (T/s/GPU),GlobalBatchSize (T),Precision,MP Mode,Sharding Strategy,Activation Checkpointing,Activation CPUOffload,NumParams -3b,32768,8,h100_80gb,14.38,19.18,3,6,144,1,46853,5856,4718592,amp_fp8,DEFAULT,FULL_SHARD,True,False,2730480640 -3b,8192,8,h100_80gb,23.28,23.28,3,6,144,18,153174,19146,1179648,amp_fp8,DEFAULT,FULL_SHARD,False,False,2667566080 -3b,2048,8,h100_80gb,27.7,27.7,10,6,480,119,244692,30586,983040,amp_fp8,DEFAULT,FULL_SHARD,False,False,2651837440 -3b,512,8,h100_80gb,30.25,30.25,40,6,1920,570,292217,36527,983040,amp_fp8,DEFAULT,FULL_SHARD,False,False,2647905280 -1b,32768,8,h100_80gb,17.55,17.55,1,4,32,3,100643,12580,1048576,amp_fp8,DEFAULT,FULL_SHARD,False,False,1378865152 -1b,8192,8,h100_80gb,20.71,20.71,2,4,64,31,256087,32010,524288,amp_fp8,DEFAULT,FULL_SHARD,False,False,1328533504 -1b,512,8,h100_80gb,29.06,29.06,56,4,1792,1098,562523,70315,917504,amp_fp8,DEFAULT,FULL_SHARD,False,False,1312804864 -350m,32768,8,h100_80gb,14.8,14.8,1,4,32,5,195516,24439,1048576,amp_fp8,DEFAULT,FULL_SHARD,False,False,387442688 -350m,16384,8,h100_80gb,15.31,15.31,2,4,64,20,343435,42929,1048576,amp_fp8,DEFAULT,FULL_SHARD,False,False,370665472 -350m,2048,8,h100_80gb,6.05,8.06,3,21,504,170,349409,43676,1032192,amp_fp8,DEFAULT,FULL_SHARD,True,False,355985408 
-350m,512,8,h100_80gb,17.77,17.77,56,4,1792,2412,1235360,154420,917504,amp_fp8,DEFAULT,FULL_SHARD,False,False,354412544 -30b,8192,8,h100_80gb,13.69,18.25,1,21,168,1,10128,1266,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272 -30b,8192,8,h100_80gb,13.76,18.34,1,21,168,1,10179,1272,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272 -30b,8192,8,h100_80gb,13.93,18.58,1,21,168,1,10311,1288,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272 -30b,4096,8,h100_80gb,17.25,23.0,1,21,168,3,13873,1734,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144 -30b,4096,8,h100_80gb,17.3,23.07,1,21,168,3,13917,1739,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144 -30b,4096,8,h100_80gb,17.26,23.01,1,21,168,3,13878,1734,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144 -30b,2048,8,h100_80gb,18.67,24.9,3,21,504,7,15700,1962,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080 -30b,2048,8,h100_80gb,18.67,24.89,3,21,504,7,15693,1961,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080 -30b,2048,8,h100_80gb,18.77,25.03,3,21,504,7,15781,1972,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080 -30b,1024,8,h100_80gb,19.1,25.47,6,21,1008,16,16433,2054,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048 -30b,1024,8,h100_80gb,18.91,25.22,6,21,1008,15,16271,2033,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048 -30b,1024,8,h100_80gb,19.04,25.38,6,21,1008,15,16376,2047,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048 -30b,512,8,h100_80gb,19.32,25.76,12,21,2016,32,16816,2102,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032 -30b,512,8,h100_80gb,19.19,25.59,12,21,2016,32,16702,2087,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032 -30b,512,8,h100_80gb,19.24,25.65,12,21,2016,32,16741,2092,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032 -13b,32768,8,h100_80gb,15.28,20.37,1,3,24,0,15250,1906,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960 -13b,32768,8,h100_80gb,15.36,20.49,1,3,24,0,15338,1917,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960 -13b,32768,8,h100_80gb,15.34,20.45,1,3,24,0,15309,1913,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960 -13b,8192,8,h100_80gb,18.51,24.68,5,3,120,3,30075,3759,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840 -13b,8192,8,h100_80gb,18.48,24.64,5,3,120,3,30030,3753,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840 -13b,8192,8,h100_80gb,18.58,24.77,5,3,120,3,30179,3772,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840 -13b,2048,8,h100_80gb,20.51,27.35,20,3,480,19,39532,4941,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560 -13b,2048,8,h100_80gb,20.64,27.52,20,3,480,19,39779,4972,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560 -13b,2048,8,h100_80gb,20.49,27.32,20,3,480,19,39478,4934,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560 -13b,512,8,h100_80gb,21.31,28.42,80,3,1920,84,43074,5384,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240 -13b,512,8,h100_80gb,21.25,28.34,80,3,1920,83,42954,5369,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240 -13b,512,8,h100_80gb,21.12,28.16,80,3,1920,83,42684,5335,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240 -7b,32768,8,h100_80gb,15.14,20.19,2,2,32,0,25983,3247,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128 -7b,32768,8,h100_80gb,15.22,20.3,2,2,32,0,26127,3265,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128 -7b,32768,8,h100_80gb,15.13,20.18,2,2,32,0,25973,3246,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128 
-7b,8192,8,h100_80gb,18.19,24.26,8,2,128,6,54355,6794,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832
-7b,8192,8,h100_80gb,18.06,24.08,8,2,128,6,53957,6744,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832
-7b,8192,8,h100_80gb,18.21,24.29,8,2,128,6,54419,6802,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832
-7b,2048,8,h100_80gb,20.24,26.99,32,2,512,36,74217,9277,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008
-7b,2048,8,h100_80gb,20.22,26.95,32,2,512,36,74132,9266,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008
-7b,2048,8,h100_80gb,20.23,26.97,32,2,512,36,74180,9272,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008
-7b,512,8,h100_80gb,21.01,28.01,128,2,2048,159,81676,10209,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552
-7b,512,8,h100_80gb,21.01,28.01,128,2,2048,159,81679,10209,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552
-7b,512,8,h100_80gb,20.94,27.92,128,2,2048,159,81417,10177,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552
-3b,32768,8,h100_80gb,14.01,18.68,3,6,144,1,45654,5706,4718592,amp_bf16,PURE,FULL_SHARD,True,False,2730480640
-3b,32768,8,h100_80gb,14.0,18.67,3,6,144,1,45607,5700,4718592,amp_bf16,PURE,FULL_SHARD,True,False,2730480640
-3b,32768,8,h100_80gb,13.96,18.61,3,6,144,1,45465,5683,4718592,amp_bf16,PURE,FULL_SHARD,True,False,2730480640
-3b,8192,8,h100_80gb,20.17,20.17,3,6,144,16,132753,16594,1179648,amp_bf16,PURE,FULL_SHARD,False,False,2667566080
-3b,8192,8,h100_80gb,20.05,20.05,3,6,144,16,131934,16491,1179648,amp_bf16,PURE,FULL_SHARD,False,False,2667566080
-3b,8192,8,h100_80gb,20.16,20.16,3,6,144,16,132697,16587,1179648,amp_bf16,PURE,FULL_SHARD,False,False,2667566080
-3b,2048,8,h100_80gb,22.22,22.22,10,6,480,95,196229,24528,983040,amp_bf16,PURE,FULL_SHARD,False,False,2651837440
-3b,2048,8,h100_80gb,22.21,22.21,10,6,480,95,196171,24521,983040,amp_bf16,PURE,FULL_SHARD,False,False,2651837440
-3b,2048,8,h100_80gb,22.1,22.1,10,6,480,95,195192,24399,983040,amp_bf16,PURE,FULL_SHARD,False,False,2651837440
-3b,512,8,h100_80gb,23.03,23.03,40,6,1920,434,222413,27801,983040,amp_bf16,PURE,FULL_SHARD,False,False,2647905280
-3b,512,8,h100_80gb,23.19,23.19,40,6,1920,437,223994,27999,983040,amp_bf16,PURE,FULL_SHARD,False,False,2647905280
-3b,512,8,h100_80gb,23.07,23.07,40,6,1920,435,222834,27854,983040,amp_bf16,PURE,FULL_SHARD,False,False,2647905280
-1b,32768,8,h100_80gb,16.8,16.8,1,4,32,2,96354,12044,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1378865152
-1b,32768,8,h100_80gb,16.77,16.77,1,4,32,2,96182,12022,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1378865152
-1b,32768,8,h100_80gb,16.75,16.75,1,4,32,2,96105,12013,1048576,amp_bf16,PURE,FULL_SHARD,False,False,1378865152
-1b,8192,8,h100_80gb,18.33,18.33,2,4,64,27,226682,28335,524288,amp_bf16,PURE,FULL_SHARD,False,False,1328533504
-1b,8192,8,h100_80gb,18.37,18.37,2,4,64,27,227183,28397,524288,amp_bf16,PURE,FULL_SHARD,False,False,1328533504
-1b,8192,8,h100_80gb,18.2,18.2,2,4,64,27,225010,28126,524288,amp_bf16,PURE,FULL_SHARD,False,False,1328533504
-350m,32768,8,h100_80gb,14.48,14.48,1,4,32,5,191165,23895,1048576,amp_bf16,PURE,FULL_SHARD,False,False,387442688
-350m,32768,8,h100_80gb,14.44,14.44,1,4,32,5,190718,23839,1048576,amp_bf16,PURE,FULL_SHARD,False,False,387442688
-350m,32768,8,h100_80gb,14.49,14.49,1,4,32,5,191350,23918,1048576,amp_bf16,PURE,FULL_SHARD,False,False,387442688
-350m,16384,8,h100_80gb,14.95,14.95,2,4,64,20,335478,41934,1048576,amp_bf16,PURE,FULL_SHARD,False,False,370665472
-350m,16384,8,h100_80gb,14.88,14.88,2,4,64,20,333921,41740,1048576,amp_bf16,PURE,FULL_SHARD,False,False,370665472
-350m,16384,8,h100_80gb,14.98,14.98,2,4,64,20,336016,42002,1048576,amp_bf16,PURE,FULL_SHARD,False,False,370665472
-350m,512,8,h100_80gb,16.16,16.16,56,4,1792,2194,1123449,140431,917504,amp_bf16,PURE,FULL_SHARD,False,False,354412544
-350m,512,8,h100_80gb,16.4,16.4,56,4,1792,2226,1139870,142483,917504,amp_bf16,PURE,FULL_SHARD,False,False,354412544
-350m,512,8,h100_80gb,16.38,16.38,56,4,1792,2224,1138963,142370,917504,amp_bf16,PURE,FULL_SHARD,False,False,354412544
diff --git a/scripts/train/benchmarking/h100.numbers b/scripts/train/benchmarking/h100.numbers
deleted file mode 100755
index 4d11c45f13..0000000000
Binary files a/scripts/train/benchmarking/h100.numbers and /dev/null differ
diff --git a/scripts/train/benchmarking/h100_vsOAI.csv b/scripts/train/benchmarking/h100_vsOAI.csv
deleted file mode 100644
index 46a14f7163..0000000000
--- a/scripts/train/benchmarking/h100_vsOAI.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-Model,SeqLen (T),# GPUs,GPU,MFU,HFU,MicroBatchSize,GradAccum,GlobalBatchSize,Throughput (S/s),Throughput (T/s),Throughput (T/s/GPU),GlobalBatchSize (T),Precision,MP Mode,Sharding Strategy,Activation Checkpointing,Activation CPUOffload,NumParams
-70b,2048,64,h100_80gb,42.57,56.76,8,4,2048,32,66523,1039,4194304,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,64862437376
-70b,2048,32,h100_80gb,36.15,48.2,2,16,1024,13,28242,882,2097152,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,64862437376
-30b,8192,8,h100_80gb,29.92,39.9,1,21,168,1,11072,1384,1376256,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,30019254272
-30b,4096,8,h100_80gb,35.86,47.81,1,21,168,3,14419,1802,688128,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,29989894144
-30b,2048,32,h100_80gb,43.92,58.57,14,3,1344,36,73860,2308,2752512,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,29975214080
-30b,2048,16,h100_80gb,43.07,57.42,10,3,480,17,36209,2263,983040,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,29975214080
-30b,2048,8,h100_80gb,38.11,50.82,3,21,504,7,16022,2002,1032192,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,29975214080
-30b,1024,8,h100_80gb,38.76,51.68,6,21,1008,16,16672,2084,1032192,amp_bf16,DEFAULT,FULL_SHARD,TRUE,FALSE,29967874048
\ No newline at end of file
diff --git a/scripts/train/benchmarking/h100_vsOAI.numbers b/scripts/train/benchmarking/h100_vsOAI.numbers
deleted file mode 100755
index 7b063de9b9..0000000000
Binary files a/scripts/train/benchmarking/h100_vsOAI.numbers and /dev/null differ
diff --git a/scripts/train/benchmarking/h100new.csv b/scripts/train/benchmarking/h100new.csv
deleted file mode 100644
index 01824a76be..0000000000
--- a/scripts/train/benchmarking/h100new.csv
+++ /dev/null
@@ -1,132 +0,0 @@
-Model,SeqLen (T),# GPUs,GPU,MFU,HFU,MicroBatchSize,GradAccum,GlobalBatchSize,Throughput (S/s),Throughput (T/s),Throughput (T/s/GPU),GlobalBatchSize (T),Precision,MP Mode,Sharding Strategy,Activation Checkpointing,Activation CPUOffload,NumParams,Image
-70b,2048,64,h100_80gb,42.57,56.76,8,4,2048,32,66523,1039,4194304,amp_bf16,DEFAULT,FULL_SHARD,True,False,64862437376,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-70b,2048,32,h100_80gb,36.15,48.2,2,16,1024,13,28242,882,2097152,amp_bf16,DEFAULT,FULL_SHARD,True,False,64862437376,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,8192,8,h100_80gb,29.92,39.9,1,21,168,1,11072,1384,1376256,amp_bf16,DEFAULT,FULL_SHARD,True,False,30019254272,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,8192,8,a100_80gb,39.38,52.5,1,21,168,0,4594,574,1376256,amp_bf16,DEFAULT,FULL_SHARD,True,False,30019254272,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,4096,8,h100_80gb,35.86,47.81,1,21,168,3,14419,1802,688128,amp_bf16,DEFAULT,FULL_SHARD,True,False,29989894144,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,4096,8,a100_80gb,51.37,68.49,1,21,168,1,6513,814,688128,amp_bf16,DEFAULT,FULL_SHARD,True,False,29989894144,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,2048,32,h100_80gb,43.92,58.57,14,3,1344,36,73860,2308,2752512,amp_bf16,DEFAULT,FULL_SHARD,True,False,29975214080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,2048,16,h100_80gb,43.07,57.42,10,3,480,17,36209,2263,983040,amp_bf16,DEFAULT,FULL_SHARD,True,False,29975214080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,2048,8,h100_80gb,38.11,50.82,3,21,504,7,16022,2002,1032192,amp_bf16,DEFAULT,FULL_SHARD,True,False,29975214080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,2048,8,a100_80gb,55.3,73.74,3,21,504,3,7330,916,1032192,amp_bf16,DEFAULT,FULL_SHARD,True,False,29975214080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,1024,8,h100_80gb,38.76,51.68,6,21,1008,16,16672,2084,1032192,amp_bf16,DEFAULT,FULL_SHARD,True,False,29967874048,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,1024,8,a100_80gb,55.82,74.43,6,21,1008,7,7571,946,1032192,amp_bf16,DEFAULT,FULL_SHARD,True,False,29967874048,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-30b,512,8,a100_80gb,56.4,75.2,12,21,2016,15,7739,967,1032192,amp_bf16,DEFAULT,FULL_SHARD,True,False,29964204032,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,32768,8,h100_80gb,31.68,42.24,1,3,24,0,15812,1976,786432,amp_bf16,DEFAULT,FULL_SHARD,True,False,13011240960,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,32768,8,a100_80gb,51.69,68.92,1,3,24,0,8134,1016,786432,amp_bf16,DEFAULT,FULL_SHARD,True,False,13011240960,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,16384,8,h100_80gb,35.55,47.4,3,3,72,1,23881,2985,1179648,amp_bf16,DEFAULT,FULL_SHARD,True,False,12927354880,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,16384,8,a100_80gb,54.07,72.1,3,3,72,0,11454,1431,1179648,amp_bf16,DEFAULT,FULL_SHARD,True,False,12927354880,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,8192,8,a100_80gb,56.07,74.76,5,3,120,1,14362,1795,983040,amp_bf16,DEFAULT,FULL_SHARD,True,False,12885411840,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,4096,8,h100_80gb,41.6,55.47,10,3,240,9,37740,4717,983040,amp_bf16,DEFAULT,FULL_SHARD,True,False,12864440320,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,4096,8,a100_80gb,57.62,76.82,10,3,240,4,16482,2060,983040,amp_bf16,DEFAULT,FULL_SHARD,True,False,12864440320,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,2048,64,h100_80gb,39.86,39.86,2,1,128,150,307209,4800,262144,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,2048,32,h100_80gb,39.95,39.95,2,1,64,75,153960,4811,131072,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,2048,16,h100_80gb,39.58,39.58,2,1,32,37,76280,4767,65536,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,2048,8,a100_80gb,59.57,59.57,2,3,48,8,18097,2262,98304,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,2048,8,h100_80gb,39.79,39.79,2,1,16,18,38336,4792,32768,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,1024,8,h100_80gb,44.27,59.03,40,3,960,42,44019,5502,983040,amp_bf16,DEFAULT,FULL_SHARD,True,False,12848711680,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-13b,1024,8,a100_80gb,59.48,79.3,40,3,960,18,18647,2330,983040,amp_bf16,DEFAULT,FULL_SHARD,True,False,12848711680,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,65536,8,h100_80gb,28.59,38.13,1,2,16,0,15654,1956,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6918905856,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,65536,8,a100_80gb,46.97,62.63,1,2,16,0,8108,1013,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6918905856,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,32768,8,h100_80gb,30.94,41.25,2,2,32,0,26550,3318,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6784688128,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,32768,8,a100_80gb,49.46,65.94,2,2,32,0,13382,1672,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6784688128,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,16384,8,a100_80gb,51.96,69.28,4,2,64,1,19629,2453,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6717579264,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,8192,8,h100_80gb,37.14,49.52,8,2,128,6,55481,6935,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6684024832,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,8192,8,a100_80gb,54.47,72.62,8,2,128,3,25655,3206,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6684024832,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,4096,8,h100_80gb,40.42,53.9,16,2,256,16,68893,8611,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6667247616,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,4096,8,a100_80gb,54.84,73.12,16,2,256,7,29472,3684,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6667247616,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,2048,8,a100_80gb,64.23,64.23,6,2,96,18,37130,4641,196608,amp_bf16,DEFAULT,FULL_SHARD,False,False,6658859008,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,2048,8,h100_80gb,46.44,46.44,6,1,48,41,85144,10643,98304,amp_bf16,DEFAULT,FULL_SHARD,False,False,6658859008,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,1024,8,h100_80gb,42.83,57.11,64,2,1024,79,81628,10203,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6654664704,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-7b,1024,8,a100_80gb,58.01,77.35,64,2,1024,34,34857,4357,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,6654664704,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,65536,8,h100_80gb,26.81,35.74,1,2,16,0,26099,3262,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,2814366720,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,65536,8,a100_80gb,46.05,61.41,1,2,16,0,14137,1767,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,2814366720,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,32768,8,h100_80gb,28.84,38.46,3,6,144,1,46984,5873,4718592,amp_bf16,DEFAULT,FULL_SHARD,True,False,2730480640,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,32768,8,a100_80gb,47.18,62.91,3,6,144,0,24235,3029,4718592,amp_bf16,DEFAULT,FULL_SHARD,True,False,2730480640,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,16384,8,h100_80gb,36.34,36.34,1,6,48,5,89223,11152,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,2688537600,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,16384,8,a100_80gb,57.13,57.13,1,6,48,2,44233,5529,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,2688537600,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,8192,8,h100_80gb,40.31,40.31,3,6,144,16,132626,16578,1179648,amp_bf16,DEFAULT,FULL_SHARD,False,False,2667566080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,8192,8,a100_80gb,59.34,59.34,3,6,144,7,61567,7695,1179648,amp_bf16,DEFAULT,FULL_SHARD,False,False,2667566080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,4096,8,h100_80gb,42.31,42.31,5,6,240,40,167712,20964,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2657080320,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,4096,8,a100_80gb,60.53,60.53,5,6,240,18,75658,9457,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2657080320,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,2048,64,h100_80gb,40.8,40.8,6,3,1152,703,1441663,22525,2359296,amp_bf16,DEFAULT,FULL_SHARD,False,False,2651837440,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,2048,32,h100_80gb,41.7,41.7,6,3,576,359,736701,23021,1179648,amp_bf16,DEFAULT,FULL_SHARD,False,False,2651837440,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,2048,16,h100_80gb,43.73,43.73,10,3,480,188,386285,24142,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2651837440,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,2048,8,a100_80gb,62.11,62.11,10,2,160,42,86491,10811,327680,amp_bf16,DEFAULT,FULL_SHARD,False,False,2651837440,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,1024,8,h100_80gb,46.2,46.2,20,6,960,211,216369,27046,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2649216000,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,1024,8,a100_80gb,62.73,62.73,20,6,960,90,92643,11580,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2649216000,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,512,8,h100_80gb,46.32,46.32,40,6,1920,436,223721,27965,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2647905280,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-3b,512,8,a100_80gb,63.71,63.71,40,6,1920,189,97019,12127,983040,amp_bf16,DEFAULT,FULL_SHARD,False,False,2647905280,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,65536,8,h100_80gb,26.34,35.12,1,2,16,0,44050,5506,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,1445974016,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,65536,8,a100_80gb,46.18,61.57,1,2,16,0,24353,3044,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,1445974016,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,32768,8,h100_80gb,33.54,33.54,1,4,32,2,96203,12025,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,1378865152,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,32768,8,a100_80gb,55.52,55.52,1,4,32,1,50207,6275,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,1378865152,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,16384,8,h100_80gb,35.22,35.22,2,4,64,9,157194,19649,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,1345310720,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,16384,8,a100_80gb,56.6,56.6,2,4,64,4,79650,9956,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,1345310720,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,8192,8,h100_80gb,37.73,37.73,3,4,96,28,233256,29157,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,1328533504,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,8192,8,a100_80gb,56.69,56.69,3,4,96,13,110516,13814,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,1328533504,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,4096,8,h100_80gb,40.26,40.26,7,4,224,75,308282,38535,917504,amp_bf16,DEFAULT,FULL_SHARD,False,False,1320144896,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,4096,8,a100_80gb,59.0,59.0,7,4,224,34,142457,17807,917504,amp_bf16,DEFAULT,FULL_SHARD,False,False,1320144896,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,2048,64,h100_80gb,40.85,40.85,20,1,1280,1387,2841754,44402,2621440,amp_bf16,DEFAULT,FULL_SHARD,False,False,1315950592,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,2048,32,h100_80gb,41.52,41.52,20,1,640,705,1444183,45130,1310720,amp_bf16,DEFAULT,FULL_SHARD,False,False,1315950592,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,2048,16,h100_80gb,42.36,42.36,20,1,320,359,736596,46037,655360,amp_bf16,DEFAULT,FULL_SHARD,False,False,1315950592,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,2048,8,a100_80gb,59.86,59.86,14,4,448,80,164109,20513,917504,amp_bf16,DEFAULT,FULL_SHARD,False,False,1315950592,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,2048,8,h100_80gb,41.82,41.82,14,1,112,177,363645,45455,229376,amp_bf16,DEFAULT,FULL_SHARD,False,False,1315950592,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,1024,8,h100_80gb,41.95,41.95,18,4,576,382,391287,48910,589824,amp_bf16,DEFAULT,FULL_SHARD,False,False,1313853440,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,1024,8,a100_80gb,60.15,60.15,18,4,576,172,176898,22112,589824,amp_bf16,DEFAULT,FULL_SHARD,False,False,1313853440,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,512,8,h100_80gb,43.21,43.21,56,4,1792,816,418201,52275,917504,amp_bf16,DEFAULT,FULL_SHARD,False,False,1312804864,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-1b,512,8,a100_80gb,60.68,60.68,56,4,1792,361,185186,23148,917504,amp_bf16,DEFAULT,FULL_SHARD,False,False,1312804864,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,65536,8,a100_80gb,45.34,60.45,1,2,16,0,33150,4143,1048576,amp_bf16,DEFAULT,FULL_SHARD,True,False,857988096,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,32768,8,h100_80gb,31.84,31.84,1,2,16,3,130333,16291,524288,amp_bf16,DEFAULT,FULL_SHARD,False,False,807656448,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,32768,8,a100_80gb,54.57,54.57,1,2,16,2,70417,8802,524288,amp_bf16,DEFAULT,FULL_SHARD,False,False,807656448,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,16384,8,h100_80gb,33.57,33.57,3,2,48,13,222521,27815,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,782490624,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,16384,8,a100_80gb,54.64,54.64,3,2,48,6,114198,14274,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,782490624,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,8192,8,h100_80gb,34.84,34.84,6,2,96,40,334602,41825,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,769907712,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,8192,8,a100_80gb,55.31,55.31,6,2,96,20,167471,20933,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,769907712,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,4096,8,h100_80gb,35.83,35.83,12,2,192,108,443674,55459,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,763616256,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,4096,8,a100_80gb,56.05,56.05,12,2,192,53,218808,27351,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,763616256,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,2048,32,h100_80gb,37.57,37.57,24,1,768,1062,2175091,67971,1572864,amp_bf16,DEFAULT,FULL_SHARD,False,False,760470528,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,2048,16,h100_80gb,37.89,37.89,24,1,384,535,1096819,68551,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,760470528,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,2048,8,h100_80gb,34.9,34.9,24,2,384,246,505177,63147,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,760470528,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,2048,8,a100_80gb,56.85,56.85,24,2,384,126,259472,32434,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,760470528,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,1024,8,h100_80gb,39.76,39.76,48,2,768,613,628648,78581,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,758897664,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,1024,8,a100_80gb,47.76,47.76,48,2,768,232,238122,29765,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,758897664,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,512,8,h100_80gb,40.42,40.42,96,2,1536,1308,669998,83749,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,758111232,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-760m,512,8,a100_80gb,45.07,45.07,96,2,1536,460,235571,29446,786432,amp_bf16,DEFAULT,FULL_SHARD,False,False,758111232,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,65536,8,h100_80gb,28.56,28.56,1,2,16,1,103458,12932,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,420997120,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,65536,8,a100_80gb,52.7,52.7,1,2,16,0,60195,7524,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,420997120,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,32768,8,h100_80gb,28.89,28.89,2,2,32,5,190745,23843,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,387442688,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,32768,8,a100_80gb,52.46,52.46,2,2,32,3,109222,13652,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,387442688,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,16384,8,h100_80gb,30.36,30.36,4,2,64,20,340610,42576,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,370665472,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,16384,8,a100_80gb,53.28,53.28,4,2,64,11,188478,23559,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,370665472,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,8192,8,h100_80gb,31.78,31.78,8,2,128,66,548110,68513,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,362276864,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,8192,8,a100_80gb,53.8,53.8,8,2,128,35,292559,36569,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,362276864,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,4096,8,a100_80gb,53.31,53.31,16,2,256,96,396442,49555,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,358082560,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,2048,32,h100_80gb,32.5,32.5,32,1,1024,1833,3755965,117373,2097152,amp_bf16,DEFAULT,FULL_SHARD,False,False,355985408,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,2048,16,h100_80gb,33.04,33.04,32,1,512,932,1909430,119339,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,355985408,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,2048,8,h100_80gb,33.99,33.99,32,2,512,479,982082,122760,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,355985408,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,2048,8,a100_80gb,51.78,51.78,32,2,512,230,471744,58968,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,355985408,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,2048,8,a100_80gb,51.62,51.62,32,2,512,229,470263,58782,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,355985408,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,1024,8,h100_80gb,34.8,34.8,64,2,1024,1106,1132789,141598,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,354936832,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,1024,8,a100_80gb,50.51,50.51,64,2,1024,506,518504,64813,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,354936832,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-350m,512,8,a100_80gb,50.61,50.61,128,2,2048,1083,554643,69330,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,354412544,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,65536,8,h100_80gb,27.9,27.9,1,2,16,4,266378,33297,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,174070272,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,65536,8,a100_80gb,54.13,54.13,1,2,16,2,162946,20368,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,174070272,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,32768,8,h100_80gb,27.53,27.53,2,2,32,14,482399,60299,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,148904448,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,32768,8,a100_80gb,52.71,52.71,2,2,32,8,291256,36407,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,148904448,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,16384,8,h100_80gb,27.62,27.62,4,2,64,50,831475,103934,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,136321536,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,16384,8,a100_80gb,50.61,50.61,4,2,64,29,480322,60040,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,136321536,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,8192,8,a100_80gb,48.85,48.85,8,2,128,88,723142,90392,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,130030080,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,4096,8,h100_80gb,26.99,26.99,16,2,256,429,1759588,219948,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,126884352,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,4096,8,a100_80gb,46.08,46.08,16,2,256,231,947172,118396,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,126884352,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,2048,32,h100_80gb,24.76,24.76,32,1,1024,3913,8014684,250458,2097152,amp_bf16,DEFAULT,FULL_SHARD,False,False,125311488,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,2048,16,h100_80gb,25.1,25.1,32,1,512,1982,4061159,253822,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,125311488,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,2048,8,h100_80gb,26.77,26.77,40,2,640,1057,2165641,270705,1310720,amp_bf16,DEFAULT,FULL_SHARD,False,False,125311488,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,2048,8,a100_80gb,44.68,44.68,32,2,512,556,1139902,142487,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,125311488,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,2048,8,a100_80gb,44.45,44.45,32,2,512,553,1133901,141737,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,125311488,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,1024,8,h100_80gb,26.23,26.23,64,2,1024,2356,2413517,301689,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,124525056,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,1024,8,a100_80gb,43.15,43.15,64,2,1024,1222,1251751,156468,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,124525056,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,512,8,a100_80gb,42.56,42.56,128,2,2048,2588,1325455,165681,1048576,amp_bf16,DEFAULT,FULL_SHARD,False,False,124131840,mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04
-125m,2048,8,a100_40gb,-0.02,-0.02,-1,-64,512,-1,-2048,-256,1048576,bf16,PURE,FULL_SHARD,False,False,-1,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-125m,2048,8,a100_40gb,-0.02,-0.02,-1,-64,512,-1,-2048,-256,1048576,bf16,PURE,FULL_SHARD,False,False,-1,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-125m,2048,8,a100_40gb,6.79,6.79,8,2,128,84,173271,21658,262144,amp_bf16,PURE,FULL_SHARD,False,False,125311488,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
diff --git a/scripts/train/benchmarking/larger_mb.csv b/scripts/train/benchmarking/larger_mb.csv
deleted file mode 100644
index 4ccc6d6d36..0000000000
--- a/scripts/train/benchmarking/larger_mb.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-Model,SeqLen (T),# GPUs,GPU,MFU,HFU,MicroBatchSize,GradAccum,GlobalBatchSize,Throughput (S/s),Throughput (T/s),Throughput (T/s/GPU),GlobalBatchSize (T),Precision,MP Mode,Sharding Strategy,Activation Checkpointing,Activation CPUOffload,NumParams
-13b,2048,8,a100_80gb,58.67,58.67,2,2,32,8,17824,2228,65536,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560
-13b,2048,8,a100_80gb,49.18,49.18,1,2,16,7,14942,1867,32768,amp_bf16,DEFAULT,FULL_SHARD,False,False,12853954560
-7b,2048,8,a100_80gb,65.25,65.25,6,2,96,18,37723,4715,196608,amp_bf16,DEFAULT,FULL_SHARD,False,False,6658859008
-7b,2048,8,a100_80gb,62.46,62.46,4,2,64,17,36110,4513,131072,amp_bf16,DEFAULT,FULL_SHARD,False,False,6658859008
-7b,2048,8,a100_80gb,55.97,55.97,2,2,32,15,32355,4044,65536,amp_bf16,DEFAULT,FULL_SHARD,False,False,6658859008
-7b,2048,8,a100_80gb,46.59,46.59,1,2,16,13,26934,3366,32768,amp_bf16,DEFAULT,FULL_SHARD,False,False,6658859008
-3b,2048,8,a100_40gb,40.65,40.65,1,2,16,27,56609,7076,32768,amp_bf16,DEFAULT,FULL_SHARD,False,False,2651837440
-3b,2048,8,a100_80gb,34.93,46.57,1,2,16,23,48635,6079,32768,amp_bf16,DEFAULT,FULL_SHARD,True,True,2651837440
-3b,2048,8,a100_80gb,35.02,46.69,1,2,16,23,48759,6094,32768,amp_bf16,DEFAULT,FULL_SHARD,True,True,2651837440
diff --git a/scripts/train/benchmarking/larger_mb.numbers b/scripts/train/benchmarking/larger_mb.numbers
deleted file mode 100755
index 5300741f4f..0000000000
Binary files a/scripts/train/benchmarking/larger_mb.numbers and /dev/null differ
diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py
index c5d2becf1f..6c26724387 100644
--- a/scripts/train/benchmarking/submit_benchmarks.py
+++ b/scripts/train/benchmarking/submit_benchmarks.py
@@ -421,16 +421,14 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
     if 'nightly' in args.image:
         # Fix older composer deps. TODO: this should be removed once mvpatel2000/composer.git@784f50be7fa8617ed562704c0207316ca2284e71 is merged
         command += """pip install -U git+https://github.com/mvpatel2000/composer.git@784f50be7fa8617ed562704c0207316ca2284e71
        pip uninstall torch==2.0.1 --yes
-       pip install --no-cache-dir --pre --index-url https://download.pytorch.org/whl/nightly/cu121 torch==2.1.0.dev20230821+cu121"""
+        pip install --no-cache-dir --pre --index-url https://download.pytorch.org/whl/nightly/cu121 torch==2.1.0.dev20230821+cu121"""

     if gpu_type == 'h100_80gb':
         # Required for flash-attn and FP8 training
         command += f"""
        pip install flash-attn==1.0.7 --no-build-isolation
        pip install git+https://github.com/NVIDIA/TransformerEngine.git@v0.10
        pip uninstall install pydantic --yes
        pip install pydantic==1.9.0
-       cd llm-foundry/scripts
-       python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --tokenizer gpt2 --eos_text '<|endoftext|>'
-       composer train/train.py /mnt/config/parameters.yaml"""
+       """

     if args.data_remote is None:
         command += f"""
diff --git a/scripts/train/benchmarking/sweep.sh b/scripts/train/benchmarking/sweep.sh
index aada6bc4aa..1949bff4eb 100755
--- a/scripts/train/benchmarking/sweep.sh
+++ b/scripts/train/benchmarking/sweep.sh
@@ -1,109 +1,109 @@
 #!/bin/bash
-PROJECT="tput"
+PROJECT="opt30"
 GIT_COMMIT="v0.0.4"
-# IMAGE="mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04"
-IMAGE="mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04"
-CLUSTER_80GB=r9z1
+IMAGE="mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04"
+# IMAGE="mosaicml/pytorch:2.1.0_cu121-nightly20230827-python3.10-ubuntu20.04"
+CLUSTER_80GB=r1z1

 # A100 80GB
 # seqlen 2048
-python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 40 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --attn_impl xformers --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 32 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 24 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 14 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 3 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 40 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 32 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 24 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 14 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 3 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN

-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false

-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false

-# # INCREASE GPU COUNT
-# for GPU_NUM in 16 32 64
-# do
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g $GPU_NUM --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g $GPU_NUM --microbatch_size 20 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# done
+# INCREASE GPU COUNT
+for GPU_NUM in 16 32 64
+do
+    python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+    python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+    python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g $GPU_NUM --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+    python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g $GPU_NUM --microbatch_size 20 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+    python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+done

-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 16 --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 32 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 32 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 64 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 64 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 32 --microbatch_size 14 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 32 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 64 --microbatch_size 16 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 64 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 16 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 32 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 32 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 64 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 64 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 32 --microbatch_size 14 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 32 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 64 --microbatch_size 16 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN
+python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 64 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN

 # SCALE SEQUENCE LENGTH
 # seqlen 512
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --precision fp8 --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 96 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 56 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 40 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 12 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
-# # seqlen 1024
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 48 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 18 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 20 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 40 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 6 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
-# # seqlen 4096
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 12 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 7 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 5 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 1 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
-# # seqlen 8192
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 6 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 3 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 5 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 1 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
-# # seqlen 16384
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 3 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN --fsdp_config_activation_checkpointing false
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 3 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN
-# # seqlen 32768
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN
-# # seqlen 65536
-# python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN --fsdp_config_activation_checkpointing true
-# python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN --fsdp_config_activation_checkpointing true
-# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN
-# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN
+python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --precision fp8 --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
+python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 96 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 56 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 40 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 12 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN
+# seqlen 1024
+python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 48 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 18 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 20 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 40 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 6 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN
+# seqlen 4096
+python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 12 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 7 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 5 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 1 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN
+# seqlen 8192
+python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
+python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 6 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 3 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type 
a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 5 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN +python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN +# seqlen 16384 +python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN +python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN +python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 3 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN +python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 3 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN +# seqlen 32768 +python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 
--microbatch_size 1 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN +# seqlen 65536 +python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN +python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN +python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN --fsdp_config_activation_checkpointing true +python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN --fsdp_config_activation_checkpointing true +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN diff --git a/scripts/train/benchmarking/torches.csv b/scripts/train/benchmarking/torches.csv deleted file mode 100644 index 3cf7be6db5..0000000000 --- a/scripts/train/benchmarking/torches.csv +++ /dev/null @@ -1,25 +0,0 @@ -Model,SeqLen (T),# GPUs,GPU,MFU,HFU,MicroBatchSize,GradAccum,GlobalBatchSize,Throughput (S/s),Throughput (T/s),Throughput (T/s/GPU),GlobalBatchSize (T),Precision,MP Mode,Sharding Strategy,Activation Checkpointing,Activation CPUOffload,NumParams,Image -30b,2048,8,a100_80gb,52.12,69.49,3,21,504,3,6908,863,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -30b,1024,8,a100_80gb,52.43,69.91,6,21,1008,6,7111,888,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -30b,512,8,a100_80gb,52.56,70.08,12,21,2016,14,7212,901,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -13b,32768,8,a100_80gb,50.26,67.02,1,3,24,0,7910,988,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -13b,8192,8,a100_80gb,52.76,70.35,5,3,120,1,13514,1689,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -13b,2048,8,a100_80gb,55.33,73.78,20,3,480,8,16810,2101,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -13b,512,8,a100_80gb,55.93,74.57,80,3,1920,34,17821,2227,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -7b,32768,8,a100_80gb,48.75,65.0,2,2,32,0,13189,1648,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -7b,8192,8,a100_80gb,51.47,68.62,8,2,128,2,24242,3030,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -7b,2048,8,a100_80gb,54.23,72.3,32,2,512,15,31348,3918,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 
-7b,512,8,a100_80gb,55.27,73.69,128,2,2048,66,33876,4234,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552,mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
-30b,8192,8,a100_80gb,39.86,53.15,1,21,168,0,4650,581,1376256,amp_bf16,PURE,FULL_SHARD,True,False,30019254272,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-30b,4096,8,a100_80gb,49.36,65.81,1,21,168,1,6258,782,688128,amp_bf16,PURE,FULL_SHARD,True,False,29989894144,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-30b,2048,8,a100_80gb,51.8,69.06,3,21,504,3,6865,858,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29975214080,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-30b,1024,8,a100_80gb,51.97,69.3,6,21,1008,6,7048,881,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29967874048,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-30b,512,8,a100_80gb,52.04,69.38,12,21,2016,13,7140,892,1032192,amp_bf16,PURE,FULL_SHARD,True,False,29964204032,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-13b,32768,8,a100_80gb,49.34,65.79,1,3,24,0,7765,970,786432,amp_bf16,PURE,FULL_SHARD,True,False,13011240960,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-13b,8192,8,a100_80gb,52.45,69.93,5,3,120,1,13433,1679,983040,amp_bf16,PURE,FULL_SHARD,True,False,12885411840,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-13b,2048,8,a100_80gb,53.63,71.51,20,3,480,7,16293,2036,983040,amp_bf16,PURE,FULL_SHARD,True,False,12853954560,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-13b,512,8,a100_80gb,55.07,73.43,80,3,1920,34,17547,2193,983040,amp_bf16,PURE,FULL_SHARD,True,False,12846090240,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-7b,32768,8,a100_80gb,48.13,64.18,2,2,32,0,13023,1627,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6784688128,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-7b,8192,8,a100_80gb,50.19,66.91,8,2,128,2,23639,2954,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6684024832,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-7b,2048,8,a100_80gb,52.45,69.93,32,2,512,14,30321,3790,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6658859008,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-7b,512,8,a100_80gb,52.93,70.57,128,2,2048,63,32442,4055,1048576,amp_bf16,PURE,FULL_SHARD,True,False,6652567552,mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04