Skip to content

Commit

Permalink
Llama: support for max_position_embeddings (huggingface#22471)
Browse files Browse the repository at this point in the history
* Llama now supports max_position_embeddings

* Save config; Cosmetic edits
  • Loading branch information
gante authored and novice03 committed Jun 23, 2023
1 parent c794f78 commit 4aac769
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 35 deletions.
5 changes: 5 additions & 0 deletions src/transformers/models/llama/configuration_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class LlamaConfig(PretrainedConfig):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
Expand Down Expand Up @@ -85,6 +88,7 @@ def __init__(
num_hidden_layers=32,
num_attention_heads=32,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
Expand All @@ -95,6 +99,7 @@ def __init__(
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
Expand Down
49 changes: 14 additions & 35 deletions src/transformers/models/llama/modeling_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,42 +160,24 @@ def forward(self, x):
class LlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""

def __init__(
self,
hidden_size: int,
num_heads: int,
):
def __init__(self, config: LlamaConfig):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_dim = hidden_size // num_heads
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.max_position_embeddings = config.max_position_embeddings

if (self.head_dim * num_heads) != self.hidden_size:
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {num_heads})."
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(
hidden_size,
num_heads * self.head_dim,
bias=False,
)
self.k_proj = nn.Linear(
hidden_size,
num_heads * self.head_dim,
bias=False,
)
self.v_proj = nn.Linear(
hidden_size,
num_heads * self.head_dim,
bias=False,
)
self.o_proj = nn.Linear(
num_heads * self.head_dim,
hidden_size,
bias=False,
)
self.rotary_emb = LlamaRotaryEmbedding(self.head_dim)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)

def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
Expand Down Expand Up @@ -270,10 +252,7 @@ class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = LlamaAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
)
self.self_attn = LlamaAttention(config=config)
self.mlp = LlamaMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
Expand Down

0 comments on commit 4aac769

Please sign in to comment.