diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 3e0c52e89a93d5..58143192c20482 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -1101,7 +1101,8 @@ def forward(
         all_self_attentions = () if output_attentions else None
         all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
         all_hidden_states = () if output_hidden_states else None
-        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+        for i in range(len(self.h)):
+            block, layer_past = self.h[i], past_key_values[i]
             # Model parallel
             if self.model_parallel:
                 torch.cuda.set_device(hidden_states.device)
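
For context, a minimal standalone sketch (not part of the patch) of the behavioral difference between the two loop forms, assuming the motivation is that zip() silently stops at the shorter of its arguments, while explicit indexing over self.h surfaces a length mismatch in past_key_values as an error. The toy values below are hypothetical stand-ins for self.h and past_key_values.

# Toy stand-ins for self.h and past_key_values (hypothetical values).
blocks = ["block0", "block1", "block2"]
past_key_values = [("k0", "v0"), ("k1", "v1")]  # one entry short on purpose

# Old pattern: zip() pairs items until the shorter input is exhausted,
# so block2 is silently skipped.
for i, (block, layer_past) in enumerate(zip(blocks, past_key_values)):
    print("zip:", i, block, layer_past)

# New pattern: iterate over the blocks and index into the cache;
# a missing entry now raises IndexError instead of dropping a layer.
try:
    for i in range(len(blocks)):
        block, layer_past = blocks[i], past_key_values[i]
        print("index:", i, block, layer_past)
except IndexError as exc:
    print("index mismatch surfaced:", exc)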