
Provide a summarization fine-tuning example in the examples #1854

Open
CurtainRight opened this issue Dec 2, 2024 · 3 comments

@CurtainRight

Is your feature request related to a problem? Please describe.
Ideally the examples would mirror the fine-tuning examples from Hugging Face, so that everyone can switch over and get started quickly.

Describe the solution you'd like
Fine-tuning scripts that make it quick to get started.

Additional context
A fine-tuning script for summarization is urgently needed right now. https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization
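
For reference, the linked Hugging Face example is built around `Seq2SeqTrainer` together with `DataCollatorForSeq2Seq`. A rough sketch of that transformers-side pattern is below; the model name and `tokenized_train` dataset are placeholders, not something defined in this issue:

```python
# Sketch of the Hugging Face pattern used by the linked example (transformers
# API, not mindnlp). Seq2SeqTrainer + DataCollatorForSeq2Seq handle dynamic
# padding and label preparation.
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

model_name = "google/pegasus-xsum"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Pads inputs to the longest sequence in the batch and pads labels with -100
# so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    num_train_epochs=2,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,  # a pre-tokenized datasets.Dataset (placeholder)
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
```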

@CurtainRight
Author

@CurtainRight
Author

(screenshot)
DataCollatorForSeq2Seq is missing.
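
As a stopgap while `DataCollatorForSeq2Seq` is unavailable in mindnlp, a small numpy-only padding helper could approximate its behavior. This is a sketch under the assumption that each example carries `input_ids`, `attention_mask`, and `labels`, and that `-100` is the ignore index used by the loss:

```python
import numpy as np

def pad_seq2seq_batch(features, pad_token_id, label_pad_token_id=-100):
    """Rough stand-in for DataCollatorForSeq2Seq's dynamic padding:
    pad every column to the longest sequence in the batch."""
    def pad(seqs, value):
        max_len = max(len(s) for s in seqs)
        return np.array(
            [list(s) + [value] * (max_len - len(s)) for s in seqs],
            dtype=np.int64,
        )

    return {
        "input_ids": pad([f["input_ids"] for f in features], pad_token_id),
        "attention_mask": pad([f["attention_mask"] for f in features], 0),
        # -100 is the conventional ignore index for the LM loss
        "labels": pad([f["labels"] for f in features], label_pad_token_id),
    }
```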

@CurtainRight
Author

Debugging code:

```python
import numpy as np
import evaluate
from typing import List, Optional, Union, Any
from rouge import Rouge

import mindspore

# Import Fengshenbang's own tokenizer
from tokenizers_pegasus import PegasusTokenizer
from mindspore.dataset import GeneratorDataset, transforms

from mindnlp.engine import Trainer, TrainingArguments
from mindnlp.dataset import load_dataset
from mindnlp.transformers import (
    AutoModelForSeq2SeqLM
)
from mindnlp.engine.train_args.seq2seq import Seq2SeqTrainingArguments
```

Processing the data:

```python
import mindspore

train_dataset = raw_datasets = load_dataset(
    'csv',
    data_files='/home/ma-user/script/train.csv'
)

def process_dataset(dataset: GeneratorDataset, tokenizer, max_seq_len=1024, batch_size=32, shuffle=False, take_len=None):
    is_ascend = mindspore.get_context('device_target') == 'Ascend'

    # Tokenize the source text into input_ids and attention_mask
    def tokenize(text):
        if is_ascend:
            tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
        else:
            tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
        return tokenized['input_ids'], tokenized['attention_mask']

    # Tokenize the target summary into label ids
    def tokenize2(text):
        if is_ascend:
            tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
        else:
            tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
        return tokenized['input_ids']

    # Shuffle the order of the dataset
    if shuffle:
        dataset = dataset.shuffle(buffer_size=batch_size)

    # Select the first several entries of the dataset
    if take_len:
        dataset = dataset.take(take_len)

    # Apply the tokenize function, transforming the 'text' column into the two output columns generated by the tokenizer.
    dataset = dataset.map(operations=[tokenize], input_columns="text", output_columns=['input_ids', 'attention_mask'])
    # Tokenize the 'label' column (the target summaries) into a 'labels' column
    dataset = dataset.map(operations=[tokenize2], input_columns="label", output_columns="labels")
    print(dataset)

    # Batch the dataset with padding.
    if is_ascend:
        dataset = dataset.batch(batch_size)
    else:
        dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                             'attention_mask': (None, 0)})
    return dataset

batch_size = 4  # Size of each batch
train_dataset = process_dataset(train_dataset, tokenizer, batch_size=batch_size, shuffle=True)
```
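
One thing to note about the block above: on Ascend, the labels are padded to `max_length` with the tokenizer's pad token. Hugging Face models usually expect padded label positions to be set to `-100` so they are ignored by the loss; assuming mindnlp keeps that convention, an extra map step along these lines could be added (a sketch, not verified against mindnlp):

```python
import numpy as np

def mask_label_padding(labels):
    # Hypothetical helper: replace pad ids in the label column with -100 so the
    # loss ignores padded positions (assumes mindnlp uses the HF ignore_index).
    labels = np.asarray(labels).copy()
    labels[labels == tokenizer.pad_token_id] = -100
    return labels

# Applied after process_dataset here; it could equally go inside process_dataset
# right after the `tokenize2` map, before batching.
train_dataset = train_dataset.map(operations=[mask_label_padding], input_columns="labels")
```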

Training:

```python
# `model` and `tokenizer` are created earlier in the script (not shown in this paste).
training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=4,
    learning_rate=2e-5,
    num_train_epochs=2,
    logging_steps=200,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

def compute_metrics(eval_pred):
    return {}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # data_collator=data_collator,
    # compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)

checkpoint = '/home/holly-npu/work/script/checkpoint/checkpoint-20231217'
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too for easy upload

metrics = train_result.metrics
```
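
Since the script already imports `evaluate` and `Rouge`, `compute_metrics` could eventually be fleshed out along these lines. This is only a sketch: it assumes `eval_pred` carries generated token ids (e.g. with a `predict_with_generate`-style evaluation) and that the Fengshenbang `PegasusTokenizer` decodes like its transformers counterpart:

```python
import numpy as np
from rouge import Rouge

rouge = Rouge()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Undo the -100 masking of padded label positions before decoding.
    labels = np.where(np.asarray(labels) != -100, labels, tokenizer.pad_token_id)
    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # The `rouge` package expects space-separated tokens; split Chinese into characters.
    pred_texts = [" ".join(t) if t.strip() else "empty" for t in pred_texts]
    label_texts = [" ".join(t) if t.strip() else "empty" for t in label_texts]
    scores = rouge.get_scores(pred_texts, label_texts, avg=True)
    return {name: value["f"] for name, value in scores.items()}
```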

Error message:

```
AttributeError                            Traceback (most recent call last)
Cell In[33], line 27
     15 trainer = Trainer(
     16     model=model,
     17     args=training_args,
   (...)
     23     #compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     24 )
     26 checkpoint = '/home/holly-npu/work/script/checkpoint/checkpoint-20231217'
---> 27 train_result = trainer.train()
     28 trainer.save_model()  # Saves the tokenizer too for easy upload
     30 metrics = train_result.metrics

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/engine/trainer/base.py:755, in Trainer.train(self, resume_from_checkpoint, ignore_keys_for_eval, **kwargs)
    750 self.model_wrapped = self.model
    752 inner_training_loop = find_executable_batch_size(
    753     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
    754 )
--> 755 return inner_training_loop(
    756     args=args,
    757     resume_from_checkpoint=resume_from_checkpoint,
    758     ignore_keys_for_eval=ignore_keys_for_eval,
    759 )

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/engine/trainer/base.py:1107, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, ignore_keys_for_eval)
   1104 if step % args.gradient_accumulation_steps == 0:
   1105     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
-> 1107 tr_loss_step, grads = self.training_step(model, inputs)
   1108 if (
   1109     args.logging_nan_inf_filter
   1110     and (ops.isnan(tr_loss_step) or ops.isinf(tr_loss_step))
   1111 ):
   1112     # if loss is nan or inf simply add the average of previous logged losses
   1113     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/engine/trainer/base.py:1382, in Trainer.training_step(self, model, inputs)
   1379 weights += tuple(group['params'])
   1380 self.grad_fn = mindspore.value_and_grad(forward, None, weights)
-> 1382 loss, grads = self.grad_fn(inputs)
   1384 return loss / self.args.gradient_accumulation_steps, grads

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindspore/ops/composite/base.py:625, in _Grad.__call__.<locals>.after_grad(*args, **kwargs)
    624 def after_grad(*args, **kwargs):
--> 625     return grad_(fn_, weights)(*args, **kwargs)

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindspore/common/api.py:185, in _wrap_func.<locals>.wrapper(*arg, **kwargs)
    183 @wraps(fn)
    184 def wrapper(*arg, **kwargs):
--> 185     results = fn(*arg, **kwargs)
    186     return _convert_python_data(results)

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindspore/ops/composite/base.py:600, in _Grad.__call__.<locals>.after_grad(*args, **kwargs)
    598 @_wrap_func
    599 def after_grad(*args, **kwargs):
--> 600     res = self._pynative_forward_run(fn, grad, weights, args, kwargs)
    601     out = _pynative_executor.grad(fn, grad, weights, grad_position, *args, **kwargs)
    602     out = _grads_divided_by_device_num_if_recomputation(out)

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindspore/ops/composite/base.py:650, in _Grad._pynative_forward_run(self, fn, grad, weights, args, kwargs)
    648 _pynative_executor.set_grad_flag(True)
    649 _pynative_executor.new_graph(fn, *args, **new_kwargs)
--> 650 outputs = fn(*args, **new_kwargs)
    651 _pynative_executor.end_graph(fn, outputs, *args, **new_kwargs)
    652 return outputs

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/engine/trainer/base.py:1374, in Trainer.training_step.<locals>.forward(inputs)
   1373 def forward(inputs):
-> 1374     return self.compute_loss(model, inputs)

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/engine/trainer/base.py:1396, in Trainer.compute_loss(self, model, inputs, return_outputs)
   1394 else:
   1395     labels = None
-> 1396 outputs = model(**inputs)
   1397 # Save past state if it exists
   1398 # TODO: this needs to be fixed and made cleaner later.
   1399 if self.args.past_index >= 0:

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/core/nn/modules/module.py:391, in Module._wrapped_call_impl(self, *args, **kwargs)
    389 if self.ms_class:
    390     return self.forward(*args, **kwargs)
--> 391 return self._call_impl(*args, **kwargs)

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/core/nn/modules/module.py:402, in Module._call_impl(self, *args, **kwargs)
    397 # If we don't have any hooks, we want to skip the rest of the logic in
    398 # this function, and just call forward.
    399 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
    400         or _global_backward_pre_hooks or _global_backward_hooks
    401         or _global_forward_hooks or _global_forward_pre_hooks):
--> 402     return forward_call(*args, **kwargs)
    404 try:
    405     result = None

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/transformers/models/pegasus/modeling_pegasus.py:1647, in PegasusForConditionalGeneration.forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1645     use_cache = False
   1646 if decoder_input_ids is None and decoder_inputs_embeds is None:
-> 1647     decoder_input_ids = shift_tokens_right(
   1648         labels, self.config.pad_token_id, self.config.decoder_start_token_id
   1649     )
   1651 outputs = self.model(
   1652     input_ids,
   1653     attention_mask=attention_mask,
   (...)
   1666     return_dict=return_dict,
   1667 )
   1668 lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

File /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/transformers/models/pegasus/modeling_pegasus.py:55, in shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id)
     51 """
     52 Shift input ids one token to the right.
     53 """
     54 shifted_input_ids = input_ids.new_zeros(input_ids.shape)
---> 55 shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
     56 shifted_input_ids[:, 0] = decoder_start_token_id
     58 if pad_token_id is None:

AttributeError: 'StubTensor' object has no attribute 'clone'
```

There is no data_collator and no Seq2SeqTrainer, so the original Hugging Face training code cannot be migrated as-is. Please provide a basic summarization training script.
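
Until a Seq2SeqTrainer/data collator lands, one possible workaround for the `'StubTensor' object has no attribute 'clone'` error above is to precompute `decoder_input_ids` in numpy inside the dataset pipeline, so `PegasusForConditionalGeneration` never has to call `shift_tokens_right` on a MindSpore tensor. A sketch, assuming the Trainer forwards every dataset column to the model as keyword arguments and that `model.config` exposes `pad_token_id` and `decoder_start_token_id`:

```python
import numpy as np

def add_decoder_input_ids(labels):
    # Hypothetical workaround: build decoder_input_ids by shifting the labels
    # one position to the right in numpy, so the model's shift_tokens_right
    # (the source of the StubTensor.clone() error) is never reached.
    labels = np.asarray(labels)
    decoder_input_ids = np.zeros_like(labels)
    decoder_input_ids[..., 1:] = labels[..., :-1]
    decoder_input_ids[..., 0] = model.config.decoder_start_token_id
    # If labels use -100 for padding, map those positions back to the pad token.
    decoder_input_ids[decoder_input_ids == -100] = model.config.pad_token_id
    return labels, decoder_input_ids

train_dataset = train_dataset.map(
    operations=[add_decoder_input_ids],
    input_columns=["labels"],
    output_columns=["labels", "decoder_input_ids"],
)
```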
