/data/anaconda3/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
0%| | 0/1 [00:00<?, ?it/s]/data/anaconda3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:318: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/kk/TransGPT-main/supervised_finetuning.py:877 in <module> │
│ │
│ 874 │
│ 875 │
│ 876 if __name__ == "__main__": │
│ ❱ 877 │ main() │
│ 878 │
│ │
│ /data/kk/TransGPT-main/supervised_finetuning.py:848 in main │
│ │
│ 845 │ │ checkpoint = None │
│ 846 │ │ if training_args.resume_from_checkpoint is not None: │
│ 847 │ │ │ checkpoint = training_args.resume_from_checkpoint │
│ ❱ 848 │ │ train_result = trainer.train(resume_from_checkpoint=checkpoint) │
│ 849 │ │ │
│ 850 │ │ metrics = train_result.metrics │
│ 851 │ │ metrics["train_samples"] = max_train_samples │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:2007 in │
│ _inner_training_loop │
│ │
│ 2004 │ │ │ │ │ │ scale_after = self.scaler.get_scale() │
│ 2005 │ │ │ │ │ │ optimizer_was_run = scale_before <= scale_after │
│ 2006 │ │ │ │ │ else: │
│ ❱ 2007 │ │ │ │ │ │ self.optimizer.step() │
│ 2008 │ │ │ │ │ │ optimizer_was_run = not self.accelerator.optimizer_step_was_skip │
│ 2009 │ │ │ │ │ │
│ 2010 │ │ │ │ │ if optimizer_was_run: │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/accelerate/optimizer.py:134 in step │
│ │
│ 131 │ │ │ │ xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args) │
│ 132 │ │ │ elif self.scaler is not None: │
│ 133 │ │ │ │ scale_before = self.scaler.get_scale() │
│ ❱ 134 │ │ │ │ self.scaler.step(self.optimizer, closure) │
│ 135 │ │ │ │ self.scaler.update() │
│ 136 │ │ │ │ scale_after = self.scaler.get_scale() │
│ 137 │ │ │ │ # If we reduced the loss scale, it means the optimizer step was skipped │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:372 in step │
│ │
│ 369 │ │ if optimizer_state["stage"] is OptState.READY: │
│ 370 │ │ │ self.unscale_(optimizer) │
│ 371 │ │ │
│ ❱ 372 │ │ assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were rec │
│ 373 │ │ │
│ 374 │ │ retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs) │
│ 375 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AssertionError: No inf checks were recorded for this optimizer.
0%| | 0/1 [00:06<?, ?it/s]
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 49028) of binary: /data/anaconda3/bin/python
Traceback (most recent call last):
File "/data/anaconda3/bin/torchrun", line 8, in
sys.exit(main())
File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
supervised_finetuning.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2023-08-17_15:35:31
host : njxg-its-gpu01.njxg.baidu.com
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 49028)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
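
The assertion at the end of the traceback is raised by torch.cuda.amp.GradScaler.step(): it requires that unscale_() recorded at least one inf/NaN check, which only happens when some parameter owned by the wrapped optimizer actually received a gradient. A minimal sketch of that failure mode, assuming a CUDA device is available and using plain torch modules instead of the script's real model:

import torch

# Hedged, minimal sketch (not taken from the repository): the optimizer below
# only manages frozen parameters, so backward() never produces gradients for it
# and GradScaler.step() has no inf checks to look at.
backbone = torch.nn.Linear(4, 4).cuda()   # stands in for the frozen 8-bit base model
adapter = torch.nn.Linear(4, 1).cuda()    # stands in for the trainable LoRA part
for p in backbone.parameters():
    p.requires_grad = False

optimizer = torch.optim.SGD(backbone.parameters(), lr=1e-3)  # frozen params only
scaler = torch.cuda.amp.GradScaler()

with torch.cuda.amp.autocast():
    loss = adapter(backbone(torch.randn(2, 4, device="cuda"))).sum()

scaler.scale(loss).backward()
scaler.step(optimizer)  # AssertionError: No inf checks were recorded for this optimizer.

In an 8-bit + LoRA run like the one below, one common way to hit this is that the trainable adapter parameters never end up with gradients (or never end up in the optimizer), so the --fp16 scaler has nothing to check.
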
My run_sft command is:
CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node 1 supervised_finetuning.py \
    --model_type llama \
    --model_name_or_path ./DUOMO-Lab/TransGPT-v0 \
    --train_file_dir ./data/finetune \
    --validation_file_dir ./data/finetune \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --do_train \
    --do_eval \
    --use_peft True \
    --fp16 \
    --max_train_samples 2 \
    --max_eval_samples 2 \
    --load_in_8bit True \
    --num_train_epochs 1 \
    --learning_rate 2e-5 \
    --warmup_ratio 0.05 \
    --weight_decay 0.05 \
    --logging_strategy steps \
    --logging_steps 10 \
    --eval_steps 50 \
    --evaluation_strategy steps \
    --save_steps 500 \
    --save_strategy steps \
    --save_total_limit 3 \
    --gradient_accumulation_steps 1 \
    --preprocessing_num_workers 4 \
    --output_dir ./outputs-sft-v1 \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --target_modules all \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0.05 \
    --torch_dtype float16 \
    --device_map auto \
    --report_to tensorboard
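
Since the command combines --fp16, --load_in_8bit True and --use_peft True, here is a sketch of the kind of 8-bit + LoRA preparation that keeps the adapter weights trainable, so the fp16 GradScaler has gradients to inspect. This is an assumption about what the training script needs to do internally, not its actual code, and the target_modules list is only a guess at what --target_modules all expands to for a LLaMA model:

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load the 8-bit base model the same way the command above does
# (requires bitsandbytes and accelerate to be installed).
model = AutoModelForCausalLM.from_pretrained(
    "./DUOMO-Lab/TransGPT-v0",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Casts norm layers to fp32 and enables input gradients so the LoRA adapters
# added below actually receive gradients during backward().
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # guess at "--target_modules all"
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # should report a non-zero number of trainable params

If print_trainable_parameters() reports zero trainable parameters in a setup like this, the "No inf checks were recorded" assertion above is the expected symptom. Another workaround that is often suggested is simply dropping --fp16 when --load_in_8bit True is used, so the adapters train in fp32 without a GradScaler.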