from torch import nn, Tensor

import positional_encoder as pe
class TimeSeriesTransformer(nn.Module):
    """
    This class implements a transformer model that can be used for time series
    forecasting. This time series transformer model is based on the paper by
    Wu et al (2020) [1]. The paper will be referred to as "the paper".

    A detailed description of the code can be found in my article here:
    https://towardsdatascience.com/how-to-make-a-pytorch-transformer-for-time-series-forecasting-69e073d4061e

    In cases where the paper does not specify what value was used for a specific
    configuration/hyperparameter, this class uses the values from Vaswani et al
    (2017) [2] or from the PyTorch source code.

    Unlike the paper, this class assumes that the input layers, positional
    encoding layers and linear mapping layer are separate from the encoder and
    decoder, i.e. the encoder and decoder only do what is depicted as their
    sub-layers in the paper. For practical purposes, this assumption makes no
    difference - it merely means that the linear and positional encoding layers
    are implemented inside the present class and not inside the Encoder() and
    Decoder() classes.

    [1] Wu, N., Green, B., Ben, X., O'banion, S. (2020).
    'Deep Transformer Models for Time Series Forecasting:
    The Influenza Prevalence Case'.
    arXiv:2001.08317 [cs, stat] [Preprint].
    Available at: http://arxiv.org/abs/2001.08317 (Accessed: 9 March 2022).

    [2] Vaswani, A. et al. (2017)
    'Attention Is All You Need'.
    arXiv:1706.03762 [cs] [Preprint].
    Available at: http://arxiv.org/abs/1706.03762 (Accessed: 9 March 2022).
    """
    def __init__(self,
                 input_size: int,
                 dec_seq_len: int,
                 batch_first: bool,
                 out_seq_len: int = 58,
                 dim_val: int = 512,
                 n_encoder_layers: int = 4,
                 n_decoder_layers: int = 4,
                 n_heads: int = 8,
                 dropout_encoder: float = 0.2,
                 dropout_decoder: float = 0.2,
                 dropout_pos_enc: float = 0.1,
                 dim_feedforward_encoder: int = 2048,
                 dim_feedforward_decoder: int = 2048,
                 num_predicted_features: int = 1
                 ):
"""
Args:
input_size: int, number of input variables. 1 if univariate.
dec_seq_len: int, the length of the input sequence fed to the decoder
dim_val: int, aka d_model. All sub-layers in the model produce
outputs of dimension dim_val
n_encoder_layers: int, number of stacked encoder layers in the encoder
n_decoder_layers: int, number of stacked encoder layers in the decoder
n_heads: int, the number of attention heads (aka parallel attention layers)
dropout_encoder: float, the dropout rate of the encoder
dropout_decoder: float, the dropout rate of the decoder
dropout_pos_enc: float, the dropout rate of the positional encoder
dim_feedforward_encoder: int, number of neurons in the linear layer
of the encoder
dim_feedforward_decoder: int, number of neurons in the linear layer
of the decoder
num_predicted_features: int, the number of features you want to predict.
Most of the time, this will be 1 because we're
only forecasting FCR-N prices in DK2, but in
we wanted to also predict FCR-D with the same
model, num_predicted_features should be 2.
"""
        super().__init__()

        self.dec_seq_len = dec_seq_len

        # Create the three linear layers needed for the model
        self.encoder_input_layer = nn.Linear(
            in_features=input_size,
            out_features=dim_val
        )

        self.decoder_input_layer = nn.Linear(
            in_features=num_predicted_features,
            out_features=dim_val
        )

        self.linear_mapping = nn.Linear(
            in_features=dim_val,
            out_features=num_predicted_features
        )

        # Create the positional encoder
        self.positional_encoding_layer = pe.PositionalEncoder(
            d_model=dim_val,
            dropout=dropout_pos_enc
        )

        # The encoder layer used in the paper is identical to the one used by
        # Vaswani et al (2017), on which the PyTorch module is based.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
        )

        # Stack the encoder layers in nn.TransformerEncoder.
        # Passing a normalization instance seems redundant in my case, because
        # nn.TransformerEncoderLayer per default normalizes after each sub-layer
        # (https://github.com/pytorch/pytorch/issues/24930).
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_encoder_layers,
            norm=None
        )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
        )

        # Stack the decoder layers in nn.TransformerDecoder.
        # Passing a normalization instance seems redundant in my case, because
        # nn.TransformerDecoderLayer per default normalizes after each sub-layer
        # (https://github.com/pytorch/pytorch/issues/24930).
        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_decoder_layers,
            norm=None
        )
    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None,
                tgt_mask: Tensor = None) -> Tensor:
        """
        Returns a tensor with the same sequence/batch layout as tgt and
        num_predicted_features as its last dimension, i.e.
        [batch_size, target_sequence_length, num_predicted_features] if
        batch_first=True, otherwise
        [target_sequence_length, batch_size, num_predicted_features].

        Args:
            src: the input sequence to the encoder. Shape: (S, E) for unbatched
                input, (S, N, E) if batch_first=False or (N, S, E) if
                batch_first=True, where S is the source sequence length,
                N is the batch size, and E is the number of features (1 if univariate)
            tgt: the input sequence to the decoder. Shape: (T, E) for unbatched
                input, (T, N, E) if batch_first=False or (N, T, E) if
                batch_first=True, where T is the target sequence length,
                N is the batch size, and E is the number of features (1 if univariate)
            src_mask: the mask applied to the encoder output (memory) in the
                decoder's cross-attention, restricting which encoder positions
                each decoder position may attend to
            tgt_mask: the mask for the tgt sequence, preventing the decoder from
                attending to future positions in the target sequence
        """
#print("From model.forward(): Size of src as given to forward(): {}".format(src.size()))
#print("From model.forward(): tgt size = {}".format(tgt.size()))
# Pass throguh the input layer right before the encoder
src = self.encoder_input_layer(src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
#print("From model.forward(): Size of src after input layer: {}".format(src.size()))
# Pass through the positional encoding layer
src = self.positional_encoding_layer(src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
#print("From model.forward(): Size of src after pos_enc layer: {}".format(src.size()))
# Pass through all the stacked encoder layers in the encoder
# Masking is only needed in the encoder if input sequences are padded
# which they are not in this time series use case, because all my
# input sequences are naturally of the same length.
# (https://github.com/huggingface/transformers/issues/4083)
src = self.encoder( # src shape: [batch_size, enc_seq_len, dim_val]
src=src
)
#print("From model.forward(): Size of src after encoder: {}".format(src.size()))
# Pass decoder input through decoder input layer
decoder_output = self.decoder_input_layer(tgt) # src shape: [target sequence length, batch_size, dim_val] regardless of number of input features
#print("From model.forward(): Size of decoder_output after linear decoder layer: {}".format(decoder_output.size()))
#if src_mask is not None:
#print("From model.forward(): Size of src_mask: {}".format(src_mask.size()))
#if tgt_mask is not None:
#print("From model.forward(): Size of tgt_mask: {}".format(tgt_mask.size()))
# Pass throguh decoder - output shape: [batch_size, target seq len, dim_val]
decoder_output = self.decoder(
tgt=decoder_output,
memory=src,
tgt_mask=tgt_mask,
memory_mask=src_mask
)
#print("From model.forward(): decoder_output shape after decoder: {}".format(decoder_output.shape))
# Pass through linear mapping
decoder_output = self.linear_mapping(decoder_output) # shape [batch_size, target seq len]
#print("From model.forward(): decoder_output size after linear_mapping = {}".format(decoder_output.size()))
return decoder_output
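

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It shows one way to instantiate the model and call forward() with the mask
# shapes described in the docstring, assuming batch_first=True and that
# positional_encoder.PositionalEncoder accepts batch-first input of shape
# [batch_size, seq_len, d_model]. The helper generate_square_subsequent_mask
# below is a local, hypothetical utility built with torch.triu; this repository
# may provide its own mask helper instead.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch

    def generate_square_subsequent_mask(dim1: int, dim2: int) -> Tensor:
        # Float mask with -inf strictly above the diagonal, so that position i
        # cannot attend to positions j > i.
        return torch.triu(torch.full((dim1, dim2), float("-inf")), diagonal=1)

    enc_seq_len = 92   # length of the sequence fed to the encoder
    dec_seq_len = 48   # length of the sequence fed to the decoder
    batch_size = 4
    n_features = 1     # univariate example

    model = TimeSeriesTransformer(
        input_size=n_features,
        dec_seq_len=dec_seq_len,
        batch_first=True,
        num_predicted_features=1
    )

    src = torch.rand(batch_size, enc_seq_len, n_features)  # encoder input
    tgt = torch.rand(batch_size, dec_seq_len, 1)            # decoder input

    # tgt_mask has shape (T, T) and the memory (src) mask has shape (T, S),
    # as expected by nn.TransformerDecoder.
    tgt_mask = generate_square_subsequent_mask(dec_seq_len, dec_seq_len)
    src_mask = generate_square_subsequent_mask(dec_seq_len, enc_seq_len)

    out = model(src=src, tgt=tgt, src_mask=src_mask, tgt_mask=tgt_mask)
    print(out.shape)  # expected: [batch_size, dec_seq_len, num_predicted_features]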