change apply_rotary_pos_emb of GlmModel for GLM-Edge series models #34629

Merged on Nov 26, 2024 (28 commits)

Changes shown are from 20 of the 28 commits.

Commits:
e6e54f0
change apply_rotary_pos_emb
zRzRzRzRzRzRzR Nov 6, 2024
6a75751
upload for glm-edge
zRzRzRzRzRzRzR Nov 20, 2024
935fe8a
remove useless part
zRzRzRzRzRzRzR Nov 21, 2024
fa070e0
follow the suggestion
zRzRzRzRzRzRzR Nov 21, 2024
0ba58d6
fix
zRzRzRzRzRzRzR Nov 21, 2024
87d90e5
format
zRzRzRzRzRzRzR Nov 21, 2024
7038703
format
zRzRzRzRzRzRzR Nov 21, 2024
1f17ea5
test
zRzRzRzRzRzRzR Nov 21, 2024
ef9fd9c
format again
zRzRzRzRzRzRzR Nov 21, 2024
aceb417
format again
zRzRzRzRzRzRzR Nov 21, 2024
31cf72e
remove modular change
zRzRzRzRzRzRzR Nov 21, 2024
a8d3377
remove modular change
zRzRzRzRzRzRzR Nov 21, 2024
a75d83c
this apply_rotary_pos_emb need modify?
zRzRzRzRzRzRzR Nov 21, 2024
2a12a1c
fix with this
zRzRzRzRzRzRzR Nov 21, 2024
cb7a09b
format
zRzRzRzRzRzRzR Nov 21, 2024
a9001a1
format
zRzRzRzRzRzRzR Nov 21, 2024
93fb505
ruff check
zRzRzRzRzRzRzR Nov 21, 2024
c674c3e
Merge branch 'huggingface:main' into glm-4-1108
zRzRzRzRzRzRzR Nov 21, 2024
34e7229
modify modular_glm failed
zRzRzRzRzRzRzR Nov 21, 2024
c57cd93
Merge branch 'huggingface:main' into glm-4-1108
zRzRzRzRzRzRzR Nov 24, 2024
b605489
Merge branch 'huggingface:main' into glm-4-1108
zRzRzRzRzRzRzR Nov 26, 2024
0c44372
remove partial_rotary_factor of function partial_rotary_factor
zRzRzRzRzRzRzR Nov 26, 2024
8703374
fix wrong change of examples/research_projects
zRzRzRzRzRzRzR Nov 26, 2024
f81ba89
revert
zRzRzRzRzRzRzR Nov 26, 2024
73afd71
remove line 118
zRzRzRzRzRzRzR Nov 26, 2024
73614df
Merge branch 'huggingface:main' into glm-4-1108
zRzRzRzRzRzRzR Nov 26, 2024
dd47bb0
use q_rot
zRzRzRzRzRzRzR Nov 26, 2024
1ae053c
Merge branch 'glm-4-1108' of github.com:zRzRzRzRzRzRzR/transformers i…
zRzRzRzRzRzRzR Nov 26, 2024
15 changes: 6 additions & 9 deletions examples/research_projects/lxmert/demo.ipynb
@@ -23,21 +23,18 @@
}
],
"source": [
"from IPython.display import clear_output, Image, display\n",
"import PIL.Image\n",
"import io\n",
"import json\n",
"import torch\n",
"\n",
"import numpy as np\n",
"import PIL.Image\n",
"from IPython.display import Image, display\n",
"from modeling_frcnn import GeneralizedRCNN\n",
"from processing_image import Preprocess\n",
"from visualizing_image import SingleImageViz\n",
"from modeling_frcnn import GeneralizedRCNN\n",
"from utils import Config\n",
"\n",
"import utils\n",
"from transformers import LxmertForQuestionAnswering, LxmertTokenizer\n",
"import wget\n",
"import pickle\n",
"import os\n",
"from utils import Config\n",
"\n",
"\n",
"# URL = \"https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/images/input.jpg\",\n",
@@ -31,19 +31,19 @@
"source": [
"# Includes\n",
"\n",
"import h5py\n",
"import os\n",
"import json\n",
"import os\n",
"from collections import OrderedDict\n",
"\n",
"from scipy import sparse\n",
"import h5py\n",
"import numpy as np\n",
"\n",
"import torch\n",
"from scipy import sparse\n",
"from torch import nn\n",
"\n",
"from transformers import *\n",
"\n",
"\n",
"os.chdir(\"../../\")"
]
},
179 changes: 91 additions & 88 deletions examples/research_projects/visual_bert/demo.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/transformers/models/glm/configuration_glm.py
@@ -45,6 +45,7 @@ class GlmConfig(PretrainedConfig):
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The fraction of the query and key head dimension that rotary position embeddings are applied to.
head_dim (`int`, *optional*, defaults to 128):
The attention head dimension.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
@@ -93,6 +94,7 @@ def __init__(
num_hidden_layers=40,
num_attention_heads=32,
num_key_value_heads=2,
partial_rotary_factor=0.5,
head_dim=128,
hidden_act="silu",
attention_dropout=0.0,
@@ -114,6 +116,7 @@
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.partial_rotary_factor = partial_rotary_factor
self.head_dim = head_dim
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
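For orientation, a minimal sketch of the new config field (values are the defaults shown in this diff, not tied to any released checkpoint):

    from transformers import GlmConfig

    # partial_rotary_factor is the field added in this diff; 0.5 is its default.
    config = GlmConfig(head_dim=128, partial_rotary_factor=0.5)
    rotary_dim = int(config.head_dim * config.partial_rotary_factor)
    print(rotary_dim)  # 64, the width later handed to GlmRotaryEmbedding
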
71 changes: 46 additions & 25 deletions src/transformers/models/glm/convert_glm_weights_to_hf.py
@@ -37,16 +37,28 @@
# fmt: on


def merge_safetensors(input_dir: str):
all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")]
all_files = sorted(all_files, key=lambda x: int(x.rsplit("-", 3)[1]))
def load_weights(input_dir: str):
safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")]
bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")]

all_weights = {}
for file in all_files:
tensors = load_file(file)
all_weights.update(tensors)

return all_weights
if safetensor_files:
safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1]))
for file in safetensor_files:
tensors = load_file(file)
all_weights.update(tensors)
return all_weights

elif bin_files:
bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1]))
for file in bin_files:
tensors = torch.load(file, map_location="cpu")
all_weights.update(tensors)
return all_weights

else:
raise ValueError("No .safetensors or .bin files found in the specified directory.")
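
As a side note on the ordering key used above, a small illustration with made-up shard names in the usual "model-XXXXX-of-YYYYY" pattern (file names here are only examples):

    files = [
        "glm-edge/model-00002-of-00003.safetensors",
        "glm-edge/model-00001-of-00003.safetensors",
        "glm-edge/model-00003-of-00003.safetensors",
    ]
    # rsplit("-", 3)[1] takes the shard index ("00001", "00002", ...) counting dashes
    # from the right, so a dash in the directory name does not affect the sort.
    ordered = sorted(files, key=lambda x: int(x.rsplit("-", 3)[1]))
    print(ordered[0])  # glm-edge/model-00001-of-00003.safetensors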


def map_old_key_to_new(old_key):
@@ -100,7 +112,8 @@ def convert_config(original_config: dict):
"attention_bias": "add_qkv_bias",
}
similar_keys_to_keep = [
"num_attention_heads" "hidden_size",
"num_attention_heads",
"hidden_size",
"attention_dropout",
"use_cache",
"eos_token_id",
@@ -120,40 +133,43 @@
return new_config


def convert_glm_tokenizer(input_dir):
def convert_glm_tokenizer(input_dir, use_post_processor=False):
fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"])
# Add the two tokens automatically with post processor
fast_tok._tokenizer.post_processor = processors.Sequence(
[
processors.ByteLevel(trim_offsets=False),
processors.TemplateProcessing(
single="[gMASK]:0 <sop>:0 $A:0",
pair="[gMASK]:0 <sop>:0 $A:0 $B:1",
special_tokens=[("[gMASK]", 151331), ("<sop>", 151333)],
),
],
)

if use_post_processor:
fast_tok._tokenizer.post_processor = processors.Sequence(
[
processors.ByteLevel(trim_offsets=False),
processors.TemplateProcessing(
single="[gMASK]:0 <sop>:0 $A:0",
pair="[gMASK]:0 <sop>:0 $A:0 $B:1",
special_tokens=[("[gMASK]", 151331), ("<sop>", 151333)],
),
],
)
else:
fast_tok._tokenizer.post_processor = processors.Sequence(
[processors.ByteLevel(trim_offsets=False)],
)
return fast_tok
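
If the post processor is enabled, its effect can be sanity-checked after conversion along these lines (sketch only; "output_dir" stands for wherever the converted tokenizer was saved, and 151331/151333 are the [gMASK]/<sop> ids from the template above):

    from transformers import PreTrainedTokenizerFast

    tok = PreTrainedTokenizerFast.from_pretrained("output_dir")
    ids = tok("hello")["input_ids"]
    # With use_post_processor=True the encoding should start with [gMASK] <sop>.
    print(ids[:2])  # expected: [151331, 151333]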


def convert_glm_model(input_dir, output_dir):
def convert_glm_model(input_dir, output_dir, use_post_processor=False):
# Load and convert config
with open(os.path.join(input_dir, "config.json")) as f:
original_config = json.load(f)
config = convert_config(original_config)
config.save_pretrained(output_dir)

# Load and convert weights
original_state_dict = merge_safetensors(input_dir)
original_state_dict = load_weights(input_dir)
new_dict = convert_state_dict(original_state_dict, config)
with torch.device("meta"):
model = GlmForCausalLM(config)
model.load_state_dict(new_dict, strict=True, assign=True)
model.save_pretrained(output_dir)

# Load and convert tokenizer
tokenizer = convert_glm_tokenizer(input_dir)
tokenizer = convert_glm_tokenizer(input_dir, use_post_processor)
tokenizer.save_pretrained(output_dir)


@@ -169,6 +185,11 @@ def convert_glm_model(input_dir, output_dir):
type=str,
help="Location to write HF model and tokenizer",
)
parser.add_argument(
"--use_post_processor",
action="store_true",
help="Whether to apply post processor with special tokens",
)

args = parser.parse_args()
convert_glm_model(args.input_dir, args.output_dir)
convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor)
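A rough usage sketch of the updated entry point (paths are placeholders; the signature follows the diff above):

    from transformers.models.glm.convert_glm_weights_to_hf import convert_glm_model

    convert_glm_model(
        "/path/to/original/glm-edge",   # input_dir with config.json and .safetensors/.bin shards
        "/path/to/converted/glm-edge",  # output_dir for the HF model and tokenizer
        use_post_processor=True,        # the new flag; adds the [gMASK] <sop> post processor
    )
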
30 changes: 21 additions & 9 deletions src/transformers/models/glm/modeling_glm.py
@@ -142,7 +142,7 @@ def rotate_half(x):
return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, partial_rotary_factor=0.5):
Member review comment (suggested change):
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, partial_rotary_factor=0.5):
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
We actually don't need to pass the rotary_factor as an argument to the function! See the next comment; that way we don't even have to modify the modular file for the Attentions!

"""Applies Rotary Position Embedding to the query and key tensors.

Args:
Expand All @@ -159,6 +159,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The fraction of the head dimension that the rotary embedding is applied to.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
@@ -169,11 +170,12 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

# Keep half for later concatenation
q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :]
k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :]
# Keep half or full tensor for later concatenation
rotary_dim = int(q.shape[-1] * partial_rotary_factor)
Member review comment (suggested change):
- # Keep half or full tensor for later concatenation
- rotary_dim = int(q.shape[-1] * partial_rotary_factor)
+ # Keep half or full tensor for later concatenation
+ rotary_dim = cos.shape[-1]
We actually don't need to pass the rotary_factor as an argument to the function!

q, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
k, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

# Apply rotary embeddings on the first half
# Apply rotary embeddings on the first half or full tensor
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
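
To make the reviewers' point concrete, a standalone sketch of the partial-rotary application with illustrative shapes and random tensors (the even/odd split in rotate_half is an assumption here, since its body is truncated in the hunk above):

    import torch

    def rotate_half(x):
        # Interleaved rotation: pairs (x_even, x_odd) -> (-x_odd, x_even).
        x1, x2 = x[..., 0::2], x[..., 1::2]
        return torch.stack((-x2, x1), dim=-1).flatten(-2)

    # Default config values: head_dim=128, partial_rotary_factor=0.5.
    head_dim, partial_rotary_factor = 128, 0.5
    rotary_dim = int(head_dim * partial_rotary_factor)  # 64, the dim GlmRotaryEmbedding is built with

    bsz, heads, seq = 1, 2, 8
    q = torch.randn(bsz, heads, seq, head_dim)
    cos = torch.randn(bsz, 1, seq, rotary_dim)  # last dim already equals rotary_dim
    sin = torch.randn(bsz, 1, seq, rotary_dim)

    # The reviewers' point: rotary_dim can be read off cos, so no extra argument is needed.
    assert cos.shape[-1] == rotary_dim
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    print(q_embed.shape)  # torch.Size([1, 2, 8, 128])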

@@ -216,6 +218,7 @@ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.partial_rotary_factor = config.partial_rotary_factor

def forward(
self,
@@ -240,7 +243,9 @@ def forward(
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, partial_rotary_factor=self.partial_rotary_factor
)
Member review comment: No need to pass the extra arg! See above


if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
@@ -320,7 +325,9 @@ def forward(
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, partial_rotary_factor=self.partial_rotary_factor
)
Member review comment: Same


if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
@@ -432,7 +439,9 @@ def forward(
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, partial_rotary_factor=self.partial_rotary_factor
)
Member review comment: Same


if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
@@ -705,11 +714,14 @@ def __init__(self, config: GlmConfig):
)
self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = GlmRotaryEmbedding(
dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta
dim=config.head_dim * config.partial_rotary_factor,
max_position_embeddings=config.max_position_embeddings,
base=config.rope_theta,
)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
self.partial_rotary_factor = config.partial_rotary_factor

# Initialize weights and apply final processing
self.post_init()
18 changes: 12 additions & 6 deletions src/transformers/models/glm/modular_glm.py
@@ -68,7 +68,7 @@ def rotate_half(x):
return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, partial_rotary_factor=0.5):
Member review comment: No need to pass the extra arg! See above

"""Applies Rotary Position Embedding to the query and key tensors.

Args:
@@ -85,6 +85,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The fraction of the head dimension that the rotary embedding is applied to.
Member review comment: Same, can be removed

Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
@@ -95,11 +96,12 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

# Keep half for later concatenation
q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :]
k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :]
# Keep half or full tensor for later concatenation
rotary_dim = int(q.shape[-1] * partial_rotary_factor)
Member review comment (suggested change):
- # Keep half or full tensor for later concatenation
- rotary_dim = int(q.shape[-1] * partial_rotary_factor)
+ # Keep half or full tensor for later concatenation
+ rotary_dim = cos.shape[-1]
Same as above

q, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
k, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
Collaborator review comment (suggested change):
- q, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
- k, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+ q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+ k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
We usually use these notations!


# Apply rotary embeddings on the first half
# Apply rotary embeddings on the first half or full tensor
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)

@@ -114,6 +116,7 @@ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.scaling = 1 / math.sqrt(self.head_dim)
self.partial_rotary_factor = config.partial_rotary_factor


class GlmFlashAttention2(GlmAttention, GraniteFlashAttention2):
@@ -151,8 +154,11 @@ def __init__(self, config: GlmConfig):
[GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.partial_rotary_factor = config.partial_rotary_factor
Collaborator review comment (suggested change, removing the line):
- self.partial_rotary_factor = config.partial_rotary_factor
I don't think this is used, no?

self.rotary_emb = GlmRotaryEmbedding(
dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta
dim=config.head_dim * config.partial_rotary_factor,
Member review comment (suggested change):
- dim=config.head_dim * config.partial_rotary_factor,
+ dim=int(config.head_dim * config.partial_rotary_factor),
You need int here as well

max_position_embeddings=config.max_position_embeddings,
base=config.rope_theta,
)
self.gradient_checkpointing = False

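Why the int() matters in the suggestion above (a two-line illustration; 128 and 0.5 are the config defaults):

    head_dim, partial_rotary_factor = 128, 0.5
    # The bare product is a float (64.0); the reviewer's int(...) turns it into the
    # integer dimension GlmRotaryEmbedding expects.
    print(head_dim * partial_rotary_factor, int(head_dim * partial_rotary_factor))  # 64.0 64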