# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Estimate transformer efficiency: per-layer MACs, parameter counts, and GPU latency."""
import numpy as np
import torch


def mac_per_head(seq_len, hidden_size, attention_head_size):
    """MACs performed by a single attention head at the given sequence length."""
    # Q, K and V projections: three (seq_len x hidden) @ (hidden x head) matmuls.
    per_head_qkv = 3 * seq_len * hidden_size * attention_head_size
    # Attention scores (QK^T) plus the weighted sum over the values.
    per_head_attn = 2 * seq_len * seq_len * attention_head_size
    # Projection of the head output back to the hidden dimension.
    per_head_output = seq_len * attention_head_size * hidden_size
    return per_head_qkv + per_head_attn + per_head_output


def mac_per_neuron(seq_len, hidden_size):
    """MACs performed by a single feed-forward neuron (input plus output projection)."""
    return 2 * seq_len * hidden_size


def compute_mac(
    num_heads_per_layer,
    num_neurons_per_layer,
    seq_len,
    hidden_size,
    attention_head_size,
):
    """Total MACs of a model whose layers may keep different head and neuron counts."""
    mac = 0.0
    for num_heads, num_neurons in zip(num_heads_per_layer, num_neurons_per_layer):
        attention_mac = num_heads * mac_per_head(seq_len, hidden_size, attention_head_size)
        ffn_mac = num_neurons * mac_per_neuron(seq_len, hidden_size)
        mac += attention_mac + ffn_mac
    return mac


def compute_parameters(dmodel, dhead, num_heads_per_layer, num_neurons_per_layer):
    """Parameter count of a model with per-layer head and neuron counts."""
    num_layers = num_heads_per_layer.shape[0]
    assert num_layers == num_neurons_per_layer.shape[0]
    num_parameters = 0
    for layer in range(num_layers):
        # Each kept sub-layer carries a LayerNorm with a scale and a bias vector.
        n_layer_norm = 2 * dmodel
        if num_heads_per_layer[layer] > 0:
            # Q, K and V projections (weights and biases) for every remaining head.
            n_attention = (dmodel * dhead + dhead) * num_heads_per_layer[layer] * 3
            # Output projection of the attention block.
            n_attention += dmodel * dmodel + dmodel
            n_attention += n_layer_norm
        else:
            n_attention = 0
        if num_neurons_per_layer[layer] > 0:
            # The two dense layers of the FFN block, plus their biases.
            n_ffn = (
                2 * dmodel * num_neurons_per_layer[layer]
                + dmodel
                + num_neurons_per_layer[layer]
            )
            n_ffn += n_layer_norm
        else:
            n_ffn = 0
        num_parameters += n_attention + n_ffn
    return int(num_parameters)


def compute_latency(model, tokenizer, batch, device):
    """Mean forward-pass latency in milliseconds, measured with CUDA events."""
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    repetitions = 300
    timings = np.zeros((repetitions, 1))
    # Tokenize once, so the timed region covers only the model's forward pass.
    inputs = tokenizer(batch, return_tensors="pt").to(device)
    with torch.no_grad():
        # Warm up the GPU (kernel compilation, caching allocator, clock speeds).
        for _ in range(10):
            _ = model(**inputs)
        # Measure latency.
        for rep in range(repetitions):
            starter.record()
            _ = model(**inputs)
            ender.record()
            # Wait for the GPU to finish before reading the event timers.
            torch.cuda.synchronize()
            timings[rep] = starter.elapsed_time(ender)
    return timings.mean()
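

# --- Usage sketch (illustrative, not part of the original script) ---
# A minimal demo assuming a BERT-base-like configuration: 12 layers, 12 heads
# of size 64, 3072 FFN neurons per layer, hidden size 768, sequence length 128.
# The model name "bert-base-uncased" and the `transformers` import below are
# assumptions made for the latency check, not requirements of the code above.
if __name__ == "__main__":
    num_heads_per_layer = np.full(12, 12)
    num_neurons_per_layer = np.full(12, 3072)

    mac = compute_mac(num_heads_per_layer, num_neurons_per_layer, 128, 768, 64)
    n_params = compute_parameters(768, 64, num_heads_per_layer, num_neurons_per_layer)
    print(f"MACs: {mac:.3e}  parameters: {n_params:,}")

    # The latency helper needs a CUDA device and a real model/tokenizer pair.
    if torch.cuda.is_available():
        from transformers import AutoModel, AutoTokenizer

        device = torch.device("cuda")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = AutoModel.from_pretrained("bert-base-uncased").to(device).eval()
        latency_ms = compute_latency(model, tokenizer, ["hello world"], device)
        print(f"mean latency: {latency_ms:.2f} ms")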