diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml
index 890029f911ae..7098fe73abff 100755
--- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml
+++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml
@@ -85,16 +85,22 @@ model:
       type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
       adapter_dim: 32
       adapter_dropout: 0.0
-      norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used.
+      norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
       column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
       row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
       norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
+      layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False # if True, adapter weights are tied (shared) across the selected layers
+      position_embedding_strategy: null # used only when weight_tying is True, options: add, concat, mlpconcat, biasadd or null

     lora_tuning:
       adapter_dim: 32
       adapter_dropout: 0.0
       column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
       row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False # if True, lora weights are tied (shared) across the selected layers
+      position_embedding_strategy: null # used only when weight_tying is True, options: add, concat, mlpconcat, biasadd or null

     # Used for p-tuning peft training
     p_tuning:
@@ -102,6 +108,9 @@ model:
       bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck
       embedding_dim: 1024 # the size of the prompt encoder embeddings
       init_std: 0.023
+
+    ia3_tuning:
+      layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layer 1 (lowest) and 12. 
null will apply adapters to all layers data: train_ds: diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py index f9f8e1ee952f..cc7cb8060be1 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py @@ -25,9 +25,11 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import ( MegatronGPTAdapterModel, + MegatronGPTAdapterModelWeightTying, MegatronGPTAdapterPTuningModel, MegatronGPTIA3Model, MegatronGPTLoRAModel, + MegatronGPTLoRAModelWeightTying, MegatronGPTPTuningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTModel @@ -114,7 +116,10 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): def _get_peft_scheme(cfg): if cfg.peft.peft_scheme == "adapter": - peft_cls = MegatronGPTAdapterModel + if cfg.peft.adapter_tuning.weight_tying: + peft_cls = MegatronGPTAdapterModelWeightTying + else: + peft_cls = MegatronGPTAdapterModel elif cfg.peft.peft_scheme == "ia3": peft_cls = MegatronGPTIA3Model elif cfg.peft.peft_scheme == "ptuning": @@ -122,7 +127,10 @@ def _get_peft_scheme(cfg): elif cfg.peft.peft_scheme == "adapter_and_ptuning": peft_cls = MegatronGPTAdapterPTuningModel elif cfg.peft.peft_scheme == "lora": - peft_cls = MegatronGPTLoRAModel + if cfg.peft.lora_tuning.weight_tying: + peft_cls = MegatronGPTLoRAModelWeightTying + else: + peft_cls = MegatronGPTLoRAModel else: raise RuntimeError("Invalid Peft scheme") return peft_cls diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 776c0558d5ab..c32c9a8c5d23 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -21,8 +21,10 @@ AdapterName, InfusedAdapterConfig, LoraKQVAdapterConfig, + LoraKQVAdapterWeightTyingConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, + ParallelLinearAdapterWeightTyingConfig, PromptEncoderAdapterConfig, ) from nemo.core.classes.mixins import adapter_mixins @@ -131,7 +133,37 @@ def setup_optimizer_param_groups(self): logging.info(f"Optimizer groups set:\n{self.summarize()}") -class MegatronGPTAdapterModel(MegatronGPTPEFTModel): +class MegatronGPTLayerwisePEFTModel(MegatronGPTPEFTModel): + def __init__( + self, cfg: DictConfig, trainer: Trainer, + ): + super().__init__(cfg, trainer) + + def init_peft_modules(self): + """ + Randomly initialize the peft params and add them to the appropriate modules. 
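+        Adapters are only added to the transformer layers whose layer_number appears in self.layer_selection.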
+ """ + assert len(self.peft_name_keys) > 0, "peft_name_keys have not been set no PEFT modules will be added" + assert len(self.name_key_to_cfg) > 0, "name_key_to_cfg has not been set no PEFT modules will be added" + logging.info(f"Before adding PEFT params:\n{self.summarize()}") + for layer in self.model.language_model.encoder.layers: + if layer.layer_number in self.layer_selection: + for _, module in layer.named_modules(): + if isinstance(module, adapter_mixins.AdapterModuleMixin): + for peft_key in self.peft_name_keys: + peft_cfg = self.name_key_to_cfg[peft_key] + if ( + model_utils.import_class_by_path(peft_cfg._target_) + in module.get_accepted_adapter_types() + ): + module.add_adapter( + name=peft_key, cfg=peft_cfg, + ) + logging.info(f"After adding PEFT params:\n{self.summarize()}") + return True + + +class MegatronGPTAdapterModel(MegatronGPTLayerwisePEFTModel): """ MegatronGPTAdapterLearningModel is a model that combines a base model (GPTSFTModel) with a adapters. This class only supports the canonical Adapter training described in Houlsby et al. (https://arxiv.org/pdf/1902.00751.pdf) @@ -151,7 +183,6 @@ def __init__( AdapterName.POST_ATTN_ADAPTER, ] adapter_tuning_cfg = cfg.peft.adapter_tuning - adapter_cfg = ParallelLinearAdapterConfig( in_features=cfg.hidden_size, out_features=cfg.hidden_size, @@ -167,10 +198,73 @@ def __init__( for k in self.peft_name_keys: self.name_key_to_cfg[k] = adapter_cfg + self.layer_selection = adapter_tuning_cfg.get("layer_selection", None) + if self.layer_selection is None: + self.layer_selection = list(range(1, cfg.num_layers + 1)) super().__init__(cfg, trainer) -class MegatronGPTIA3Model(MegatronGPTPEFTModel): +class MegatronGPTAdapterModelWeightTying(MegatronGPTLayerwisePEFTModel): + """ + TODO + """ + + def __init__( + self, cfg: DictConfig, trainer: Trainer, + ): + self.peft_name_keys = [ + AdapterName.PRE_ATTN_ADAPTER, + AdapterName.POST_ATTN_ADAPTER, + ] + adapter_tuning_cfg = cfg.peft.adapter_tuning + + adapter_cfg = ParallelLinearAdapterWeightTyingConfig( + in_features=cfg.hidden_size, + out_features=cfg.hidden_size, + dim=adapter_tuning_cfg.adapter_dim, + norm_position=adapter_tuning_cfg.get("norm_position", "pre"), + norm_type=adapter_tuning_cfg.get("norm_type", "mixedfusedlayernorm"), + column_init_method=adapter_tuning_cfg.get("column_init_method", "xavier"), + row_init_method=adapter_tuning_cfg.get("row_init_method", "zero"), + dropout=adapter_tuning_cfg.adapter_dropout, + num_position_embeddings=cfg.num_layers * 2, + dim_position_embeddings=cfg.hidden_size, + position_embedding_strategy=adapter_tuning_cfg.get("position_embedding_strategy", None), + ) + + self.name_key_to_cfg = {} + for k in self.peft_name_keys: + self.name_key_to_cfg[k] = adapter_cfg + + self.layer_selection = adapter_tuning_cfg.get("layer_selection", None) + if self.layer_selection is None: + self.layer_selection = list(range(1, cfg.num_layers + 1)) + super().__init__(cfg, trainer) + self.tie_weights() + + def tie_weights(self,): + pos_idx = 0 + layer0 = self.model.language_model.encoder.layers[0] + for adapter_name in layer0.adapter_layer: + adapter = layer0.get_adapter_module(adapter_name) + print(adapter_name, pos_idx) + adapter.set_position(pos_idx) + pos_idx += 1 + + for layer in self.model.language_model.encoder.layers[1:]: + for adapter_name in layer.adapter_layer: + print(adapter_name, pos_idx) + adapter_l = layer.get_adapter_module(adapter_name) + adapter_0 = layer0.get_adapter_module(adapter_name) + if hasattr(adapter_0, "layer_norm"): + lnorm = 
adapter_0.layer_norm + else: + lnorm = None + adapter_l.tie_weights(pos_idx, adapter_0) + pos_idx += 1 + + +class MegatronGPTIA3Model(MegatronGPTLayerwisePEFTModel): """ MegatronGPTInfusedAdapterModel is a model that combines a base model (GPTSFTModel) with a "Infused Adapter that can Inhibiting and Amplify Inner Activations", known as IA3. This class supports the addition of IA3 into a transformer based LM as described in Liu et al. (https://arxiv.org/pdf/2205.05638.pdf) @@ -330,7 +424,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens -class MegatronGPTLoRAModel(MegatronGPTPEFTModel): +class MegatronGPTLoRAModel(MegatronGPTLayerwisePEFTModel): """ MegatronGPTLoRAModel is a model that combines a base model (GPTSFTModel) with a low-rank adapters. The lora adapters will be added in `nemo/collections/nlp/modules/common/megatron/attention.py` @@ -360,8 +454,8 @@ def __init__( in_features=cfg.hidden_size, out_features=3 * projection_size, dim=lora_cfg.adapter_dim, - norm_position="none", - norm_type="none", + norm_position=None, + norm_type=None, activation="identity", column_init_method=lora_cfg.get("column_init_method", "normal"), row_init_method=lora_cfg.get("row_init_method", "zero"), @@ -372,5 +466,87 @@ def __init__( self.name_key_to_cfg = {} for k in self.peft_name_keys: self.name_key_to_cfg[k] = adapter_cfg + self.layer_selection = lora_cfg.get("layer_selection", None) + if self.layer_selection is None: + self.layer_selection = list(range(1, cfg.num_layers + 1)) + super().__init__(cfg, trainer) + +class MegatronGPTLoRAModelWeightTying(MegatronGPTLayerwisePEFTModel): + """ + TODO + """ + + def __init__( + self, cfg: DictConfig, trainer: Trainer, + ): + self.peft_name_keys = [ + AdapterName.LORA_KQV_ADAPTER, + ] + lora_cfg = cfg.peft.lora_tuning + if cfg.get("kv_channels", None) is None: + assert ( + cfg.hidden_size % cfg.num_attention_heads == 0 + ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None' + kv_channels = cfg.hidden_size // cfg.num_attention_heads + else: + kv_channels = cfg.kv_channels + projection_size = kv_channels * cfg.num_attention_heads + position_embedding_strategy = lora_cfg.get("position_embedding_strategy", None) + if position_embedding_strategy is None: + dim_position_embeddings = 0 + elif position_embedding_strategy == "add": + dim_position_embeddings = cfg.hidden_size + elif position_embedding_strategy == "biasadd": + dim_position_embeddings = 3 * projection_size + elif position_embedding_strategy == "concat": + dim_position_embeddings = lora_cfg.adapter_dim + elif position_embedding_strategy == "mlpconcat": + dim_position_embeddings = lora_cfg.adapter_dim + else: + raise RuntimeError(f"Unknown position embedding strategy {position_embedding_strategy} for tied weights") + + adapter_cfg = LoraKQVAdapterWeightTyingConfig( + in_features=cfg.hidden_size, + out_features=3 * projection_size, + dim=lora_cfg.adapter_dim, + norm_position=None, + norm_type=None, + activation="identity", + column_init_method=lora_cfg.get("column_init_method", "normal"), + row_init_method=lora_cfg.get("row_init_method", "zero"), + gather_output=False, + dropout=lora_cfg.adapter_dropout, + num_position_embeddings=cfg.num_layers, + dim_position_embeddings=dim_position_embeddings, + position_embedding_strategy=position_embedding_strategy, + ) + + self.name_key_to_cfg = {} + for k in self.peft_name_keys: + self.name_key_to_cfg[k] = adapter_cfg + self.layer_selection = lora_cfg.get("layer_selection", None) 
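+        # layer_selection entries are 1-indexed transformer layer numbers (lowest layer is 1); None falls back to all layers below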
+ if self.layer_selection is None: + self.layer_selection = list(range(1, cfg.num_layers + 1)) super().__init__(cfg, trainer) + self.tie_weights() + + def tie_weights(self,): + pos_idx = 0 + layer0 = self.model.language_model.encoder.layers[0] + for adapter_name in layer0.self_attention.adapter_layer: + adapter = layer0.self_attention.get_adapter_module(adapter_name) + print(adapter_name, pos_idx) + adapter.set_position(pos_idx) + pos_idx += 1 + + for layer in self.model.language_model.encoder.layers[1:]: + for adapter_name in layer.self_attention.adapter_layer: + print(adapter_name, pos_idx) + adapter_l = layer.self_attention.get_adapter_module(adapter_name) + adapter_0 = layer0.self_attention.get_adapter_module(adapter_name) + position_embeddings_0 = None + if adapter_0.position_embedding_strategy: + position_embeddings_0 = adapter_0.position_embeddings + adapter_l.tie_weights(pos_idx, adapter_0) + pos_idx += 1 diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index d4a75aa18fb1..576366b90ddd 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -17,7 +17,7 @@ import enum import logging from dataclasses import dataclass - +from typing import Optional import torch import torch.nn as nn import torch.nn.init as init @@ -106,8 +106,8 @@ def __init__( out_features: int, dim: int, activation: str = 'swish', - norm_position: str = 'post', - norm_type: str = 'mixedfusedlayernorm', + norm_position: Optional[str] = 'post', + norm_type: Optional[str] = 'mixedfusedlayernorm', column_init_method: str = 'xavier', # TODO: (@adithyare) should rename this to input_init_method to be more precise. row_init_method: str = 'zero', # TODO: (@adithyare) should rename this to output_init_method to be more precise. gather_output: bool = True, @@ -161,6 +161,8 @@ def __init__( self.layer_norm = nn.LayerNorm(ln_features) else: raise NotImplementedError("norm_type should be either mixedfusedlayernorm or layernorm") + else: + self.layer_norm = None if dropout > 0.0: self.dropout = nn.Dropout(dropout) @@ -215,8 +217,8 @@ class ParallelLinearAdapterConfig: out_features: int dim: int activation: str = 'swish' - norm_position: str = 'post' - norm_type: str = 'mixedfusedlayernorm' + norm_position: Optional[str] = 'post' + norm_type: Optional[str] = 'mixedfusedlayernorm' column_init_method: str = 'xavier' row_init_method: str = 'zero' gather_output: bool = True @@ -375,3 +377,153 @@ class PromptEncoderAdapterConfig: init_std: float output_dim: int _target_: str = "{0}.{1}".format(PromptEncoderAdapter.__module__, PromptEncoderAdapter.__name__) + + +class ParallelLinearAdapterWeightTying(ParallelLinearAdapter): + """ + Extends parallel linear adapter for weight tying by providing a position embedding and convenience methods for tying weights + """ + + def __init__( + self, + in_features: int, + out_features: int, + dim: int, + activation: str = 'swish', + norm_position: Optional[str] = 'post', + norm_type: Optional[str] = 'mixedfusedlayernorm', + column_init_method: str = 'xavier', # TODO: (@adithyare) should rename this to input_init_method to be more precise. + row_init_method: str = 'zero', # TODO: (@adithyare) should rename this to output_init_method to be more precise. 
+ gather_output: bool = True, + dropout: float = 0.0, + num_position_embeddings: int = 1, + dim_position_embeddings: int = 1024, + position_embedding_strategy: Optional[str] = "add", + ): + self.position_embeddings = None + self.mlp = None + self.position_embedding_strategy = position_embedding_strategy + assert self.position_embedding_strategy in ["add", "concat", "mlpconcat", "biasadd", None] + if self.position_embedding_strategy == "concat": + in_features += dim_position_embeddings + elif self.position_embedding_strategy == "mlpconcat": + in_features += dim_position_embeddings + elif self.position_embedding_strategy == "biasadd": + assert ( + out_features == dim_position_embeddings + ), "adapter output feature size should match position emb size to bias add" + elif self.position_embedding_strategy == "add": + assert ( + in_features == dim_position_embeddings + ), "adapter input feature size should match position emb size to add" + super().__init__( + in_features, + out_features, + dim, + activation, + norm_position, + norm_type, + column_init_method, + row_init_method, + gather_output, + dropout, + ) + if self.position_embedding_strategy: + self.position_embeddings = torch.nn.Embedding(num_position_embeddings, dim_position_embeddings) + self.position_embeddings.weight.data.fill_(0.0) + if self.position_embedding_strategy == "mlpconcat": + self.mlp = torch.nn.Sequential( + torch.nn.Linear(dim_position_embeddings, dim_position_embeddings, bias=False), + torch.nn.GELU(), + torch.nn.Linear(dim_position_embeddings, dim_position_embeddings, bias=False), + ) + self.register_buffer("position_id", torch.LongTensor([1]), persistent=False) + + def set_position(self, position_id): + self.position_id *= position_id + + def tie_weights(self, position_id, adapter): + + self.set_position(position_id) + if self.linear_in: + self.linear_in.weight = adapter.linear_in.weight + if self.linear_out: + self.linear_out.weight = adapter.linear_out.weight + if self.layer_norm: + self.layer_norm.weight = adapter.layer_norm.weight + self.layer_norm.bias = adapter.layer_norm.bias + if self.mlp: + self.mlp[0].weight = adapter.mlp[0].weight + self.mlp[2].weight = adapter.mlp[2].weight + if self.position_embeddings: + self.position_embeddings.weight = adapter.position_embeddings.weight + + return True + + def forward(self, x): + + if self.position_embedding_strategy: + pos = self.position_embeddings(self.position_id).unsqueeze(0) + if self.position_embedding_strategy == "add": + pos = pos.expand_as(x) + x = x + pos + + elif self.position_embedding_strategy == "concat": + pos = pos.expand(x.shape[0], x.shape[1], pos.shape[2]) + x = torch.cat((x, pos), dim=2) + elif self.position_embedding_strategy == "mlpconcat": + pos = pos.expand(x.shape[0], x.shape[1], pos.shape[2]) + pos = self.mlp(pos) + x = torch.cat((x, pos), dim=2) + + if self.norm_position == 'pre': + x = self.layer_norm(x) + + x, _ = self.linear_in(x) # (@adithyare) ColumnLinear returns output and bias, we are ignoring the bias term. 
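+        # for "concat"/"mlpconcat", __init__ widened in_features by dim_position_embeddings so linear_in matches the concatenated input above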
+ x = self.activation(x) + x, _ = self.linear_out(x) + if self.norm_position == 'post': + x = self.layer_norm(x) + + if self.position_embedding_strategy == "biasadd": + pos = pos.expand_as(x) + x = x + pos + + # Add dropout if available + if self.dropout is not None: + x = self.dropout(x) + + return x + + +@dataclass +class ParallelLinearAdapterWeightTyingConfig: + in_features: int + out_features: int + dim: int + activation: str = 'swish' + norm_position: Optional[str] = 'post' + norm_type: Optional[str] = 'mixedfusedlayernorm' + column_init_method: str = 'xavier' + row_init_method: str = 'zero' + gather_output: bool = True + dropout: float = 0.0 + num_position_embeddings: int = 1 + dim_position_embeddings: int = 1024 + position_embedding_strategy: Optional[str] = "concat" + _target_: str = "{0}.{1}".format( + ParallelLinearAdapterWeightTying.__module__, ParallelLinearAdapterWeightTying.__name__ + ) + + +class LoraKQVAdapterWeightTying(ParallelLinearAdapterWeightTying): + """ + TODO + """ + + pass + + +@dataclass +class LoraKQVAdapterWeightTyingConfig(ParallelLinearAdapterWeightTyingConfig): + _target_: str = "{0}.{1}".format(LoraKQVAdapterWeightTying.__module__, LoraKQVAdapterWeightTying.__name__) diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index a5a8b86b85bf..6b8189194333 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -22,6 +22,7 @@ AdapterName, InfusedAdapterConfig, LoraKQVAdapterConfig, + LoraKQVAdapterWeightTyingConfig, LoraKVAdapterConfig, LoraQAdapterConfig, ) @@ -143,6 +144,7 @@ def __init__( LoraKQVAdapterConfig._target_, LoraQAdapterConfig._target_, LoraKVAdapterConfig._target_, + LoraKQVAdapterWeightTyingConfig._target_, ] ) diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 9cdcccf6e685..045daaf1151a 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -25,6 +25,7 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, ParallelLinearAdapterConfig, + ParallelLinearAdapterWeightTyingConfig, ) from nemo.collections.nlp.modules.common.megatron.attention import ParallelAttention, ParallelChunkedCrossAttention from nemo.collections.nlp.modules.common.megatron.fused_bias_dropout_add import ( @@ -188,7 +189,13 @@ def __init__( self.position_embedding_type = position_embedding_type self.param_dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2) - self.set_accepted_adapter_types([LinearAdapterConfig._target_, ParallelLinearAdapterConfig._target_]) + self.set_accepted_adapter_types( + [ + LinearAdapterConfig._target_, + ParallelLinearAdapterConfig._target_, + ParallelLinearAdapterWeightTyingConfig._target_, + ] + ) if not bias and bias_dropout_add_fusion: raise ValueError(
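
For context, the following is a minimal usage sketch (illustrative values only; the keys mirror the YAML defaults added above) of the config fragment that _get_peft_scheme in megatron_gpt_peft_tuning.py inspects when choosing between the tied and untied LoRA classes.

from omegaconf import OmegaConf

# Illustrative PEFT config fragment; values are placeholders, not recommendations.
cfg = OmegaConf.create(
    {
        "peft": {
            "peft_scheme": "lora",
            "lora_tuning": {
                "adapter_dim": 32,
                "adapter_dropout": 0.0,
                "layer_selection": None,  # None applies LoRA to every layer
                "weight_tying": True,  # tie the LoRA weights across layers
                "position_embedding_strategy": "add",  # add, concat, mlpconcat, biasadd or None
            },
        }
    }
)

# With weight_tying=True, _get_peft_scheme(cfg) resolves to MegatronGPTLoRAModelWeightTying,
# which ties each layer's LoraKQVAdapterWeightTying to the layer-0 adapter and distinguishes
# layers via a learned position embedding; with weight_tying=False it resolves to the untied
# MegatronGPTLoRAModel.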