Modify the startup method and clean up the ELMo code (PaddlePaddle#201)

Co-authored-by: Zeyu Chen <[email protected]>
xiemoyuan and ZeyuChen authored Mar 30, 2021
1 parent 09185dd commit b03c4dc
Showing 16 changed files with 321 additions and 416 deletions.
10 changes: 5 additions & 5 deletions examples/dialogue/dgu/README.md
@@ -109,21 +109,21 @@ swda: composed of the multi-turn dialogue id, label, speaker caller, and conversation content conversat…
Run the following command to train the model on the training set (train.tsv) and validate on the dev set (dev.tsv); after training finishes, the model is evaluated on the test set (test.txt)

```shell
export CUDA_VISIBLE_DEVICES=0,1
# GPU launch: n_gpu specifies the number of GPUs used for training; it can be a single card or multiple cards. Training, validation, and evaluation are run by default
python -u main.py --task_name=udc --data_dir=./DGU_datasets/udc --output_dir=./checkpoints/udc --n_gpu=2
# GPU launch: gpus specifies the GPU card ids used for training; it can be a single card or multiple cards. Training, validation, and evaluation are run by default
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" --log_dir ./log main.py --task_name=udc --data_dir=./DGU_datasets/udc --output_dir=./checkpoints/udc --device=gpu
# To run evaluation only, set do_train to False; init_from_ckpt must then be specified
# python -u main.py --task_name=udc --data_dir=./DGU_datasets/udc --do_train=False --init_from_ckpt=./checkpoints/udc/best
# python -m paddle.distributed.launch --gpus "0" --log_dir ./log main.py --task_name=udc --data_dir=./DGU_datasets/udc --do_train=False --init_from_ckpt=./checkpoints/udc/best --device=gpu
```

The parameters above are:

* `task_name`: the task name; one of udc, dstc2, atis_slot, atis_intent, mrda, or swda.
* `data_dir`: path to the training data.
* `output_dir`: directory where trained model checkpoints are saved.
* `n_gpu`: number of GPU cards used for training; defaults to 1.
* `do_train`: whether to run training; defaults to `True`.
* `init_from_ckpt`: path from which to restore model parameters.
* `device`: the device used for training.

For other optional parameters and their default values, see `args.py`.

121 changes: 23 additions & 98 deletions examples/dialogue/dgu/args.py
@@ -1,109 +1,34 @@
import argparse


# yapf: disable
def parse_args():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train.")
    parser.add_argument(
        "--model_name_or_path",
        default='bert-base-uncased',
        type=str,
        help="Path to pre-trained bert model or shortcut name.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="The output directory where the checkpoints will be saved.")
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The directory where the dataset will be loaded.")
    parser.add_argument(
        "--init_from_ckpt",
        default=None,
        type=str,
        help="The path of the checkpoint to be loaded.")
    parser.add_argument(
        "--max_seq_len",
        default=None,
        type=int,
        help="The maximum total input sequence length after tokenization for training. "
        "Sequences longer than this will be truncated, sequences shorter will be padded."
    )
    parser.add_argument(
        "--test_max_seq_len",
        default=None,
        type=int,
        help="The maximum total input sequence length after tokenization for testing. "
        "Sequences longer than this will be truncated, sequences shorter will be padded."
    )
    parser.add_argument(
        "--batch_size",
        default=None,
        type=int,
        help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--test_batch_size",
        default=None,
        type=int,
        help="Batch size per GPU/CPU for testing.")
    parser.add_argument(
        "--learning_rate",
        default=None,
        type=float,
        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--weight_decay",
        default=0.01,
        type=float,
        help="Weight decay if we apply some.")
    parser.add_argument(
        "--epochs",
        default=None,
        type=int,
        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--logging_steps",
        default=None,
        type=int,
        help="Log every X update steps.")
    parser.add_argument(
        "--save_steps",
        default=None,
        type=int,
        help="Save a checkpoint every X update steps.")
    parser.add_argument(
        "--seed", default=42, type=int, help="Random seed for initialization.")
    parser.add_argument(
        "--n_gpu",
        default=1,
        type=int,
        help="The number of gpus to use, 0 for cpu.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="The proportion of warmup.")
    parser.add_argument(
        '--max_grad_norm',
        default=1.0,
        type=float,
        help='The max value of grad norm.')
    parser.add_argument(
        "--do_train", default=True, type=eval, help="Whether to run training.")
    parser.add_argument(
        "--do_eval", default=True, type=eval, help="Whether to run evaluation.")
    parser.add_argument(
        "--do_test", default=True, type=eval, help="Whether to run testing.")
    parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.")
    parser.add_argument("--model_name_or_path", default='bert-base-uncased', type=str, help="Path to pre-trained bert model or shortcut name.")
    parser.add_argument("--output_dir", default=None, type=str, help="The output directory where the checkpoints will be saved.")
    parser.add_argument("--data_dir", default=None, type=str, help="The directory where the dataset will be loaded.")
    parser.add_argument("--init_from_ckpt", default=None, type=str, help="The path of the checkpoint to be loaded.")
    parser.add_argument("--max_seq_len", default=None, type=int, help="The maximum total input sequence length after tokenization for training. Sequences longer than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--test_max_seq_len", default=None, type=int, help="The maximum total input sequence length after tokenization for testing. Sequences longer than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--batch_size", default=None, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument("--test_batch_size", default=None, type=int, help="Batch size per GPU/CPU for testing.")
    parser.add_argument("--learning_rate", default=None, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--epochs", default=None, type=int, help="Total number of training epochs to perform.")
    parser.add_argument("--logging_steps", default=None, type=int, help="Log every X update steps.")
    parser.add_argument("--save_steps", default=None, type=int, help="Save a checkpoint every X update steps.")
    parser.add_argument("--seed", default=42, type=int, help="Random seed for initialization.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float, help="The proportion of warmup.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="The max value of grad norm.")
    parser.add_argument("--do_train", default=True, type=eval, help="Whether to run training.")
    parser.add_argument("--do_eval", default=True, type=eval, help="Whether to run evaluation.")
    parser.add_argument("--do_test", default=True, type=eval, help="Whether to run testing.")
    parser.add_argument("--device", type=str, default="gpu", help="The device to use for training.")

    args = parser.parse_args()
    return args
# yapf: enable


def set_default_args(args):
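
Note that the boolean-like flags above (`do_train`, `do_eval`, `do_test`) use `type=eval`, so the string passed on the command line (e.g. `--do_train=False` in the README) is evaluated into a Python bool. A minimal, self-contained sketch of that behavior (the demo parser below is illustrative, not part of the repository):

```python
import argparse

# Minimal illustration of the `type=eval` pattern used above:
# the command-line string "False" is evaluated into the Python bool False.
demo_parser = argparse.ArgumentParser()
demo_parser.add_argument("--do_train", default=True, type=eval,
                         help="Whether to run training.")

demo_args = demo_parser.parse_args(["--do_train=False"])
print(demo_args.do_train, type(demo_args.do_train))  # False <class 'bool'>
```
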
52 changes: 24 additions & 28 deletions examples/dialogue/dgu/main.py
@@ -106,13 +106,13 @@ def print_logs(args, step, logits, labels, loss, total_time, metric):
(step, loss, f1_micro, total_time / args.logging_steps))


def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
def train(args, model, train_data_loader, dev_data_loader, metric, n_procs,
          rank):
    num_examples = len(train_data_loader) * args.batch_size * n_procs
    max_train_steps = args.epochs * len(train_data_loader)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Warmup proportion: %d" % args.warmup_proportion)
    print("\nNum train examples: %d" % num_examples)
    print("Max train steps: %d" % max_train_steps)
    print("Warmup proportion: %.2f" % args.warmup_proportion)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
                                         args.warmup_proportion)
@@ -137,8 +137,7 @@ def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
@@ -150,21 +149,20 @@ def train(args, model, train_data_loader, dev_data_loader, metric, rank):
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
            if step % args.logging_steps == 0:
                print_logs(args, step, logits, labels, loss, total_time, metric)
                total_time = 0.0
            if step % args.save_steps == 0 or step == max_train_steps:
                if rank == 0:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir, 'best')
                            print('Best model, step: %d\n' % step)
                if args.do_eval:
                    print('\nEval begin...')
                    metric_out = evaluation(args, model, dev_data_loader,
                                            metric)
                    if rank == 0 and metric_out > best_metric:
                        best_metric = metric_out
                        save_ckpt(model, optimizer, args.output_dir, 'best')
                        print('Best model, step: %d\n' % step)
            batch_start_time = time.time()


@@ -216,7 +214,7 @@ def create_data_loader(args, dataset_class, trans_func, batchify_fn, mode):


def main(args):
    paddle.set_device('gpu' if args.n_gpu else 'cpu')
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1 and args.do_train:
@@ -265,7 +263,8 @@ def main(args):
                args, dataset_class, test_trans_func, batchify_fn, 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric, rank)
        train(args, model, train_data_loader, dev_data_loader, metric,
              world_size, rank)

    if args.do_test:
        if rank == 0:
@@ -297,7 +296,4 @@ def print_args(args):
    set_default_args(args)
    print_args(args)

    if args.n_gpu > 1:
        dist.spawn(main, args=(args, ), nprocs=args.n_gpu)
    else:
        main(args)
    main(args)
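
The change above drops the in-process `dist.spawn` startup: single-card and multi-card runs now both call `main(args)` directly, and multi-process execution is driven externally by `python -m paddle.distributed.launch --gpus ...`. Below is a minimal, self-contained sketch of that entry-point pattern; the tiny `nn.Linear` model and the device choice are illustrative stand-ins, not the repository's code.

```python
import paddle
import paddle.distributed as dist
import paddle.nn as nn


def main():
    # Each process started by `python -m paddle.distributed.launch --gpus "0,1" ...`
    # runs this function once; the launcher sets the per-process environment.
    paddle.set_device("gpu" if paddle.is_compiled_with_cuda() else "cpu")

    world_size = dist.get_world_size()
    if world_size > 1:
        # Initialize collective communication only in the multi-process case.
        dist.init_parallel_env()

    model = nn.Linear(16, 2)  # stand-in for the real dialogue model
    if world_size > 1:
        # Wrap the model so gradients are synchronized across processes.
        model = paddle.DataParallel(model)

    print("rank %d / %d ready" % (dist.get_rank(), world_size))


if __name__ == "__main__":
    # No dist.spawn here: single- and multi-card runs both just call main().
    main()
```
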
22 changes: 13 additions & 9 deletions examples/dialogue/lic2021_baseline/README.md
@@ -21,6 +21,7 @@ Details of the UnifiedTransformer model can be found in [the paper](https://arxiv.org/abs/2006…
### Data Preparation

Because the sample data is tied to the LIC 2021 dialogue competition, it is not publicly released for now.
For the dataset and its preprocessing pipeline, see the [2021 Language and Intelligence Challenge: Multi-skill Dialogue](https://aistudio.baidu.com/aistudio/competition/detail/67) and the official baseline system (Baselines).

The model input consists of three parts: token ids (token_ids), token type ids (token_type_ids), and position ids (position_ids). The dataset in this project is the id-converted dataset produced by running the sample text through the data preprocessing script. Each line of data has three columns separated by ";", in the format token_ids;token_type_ids;position_ids. See `data.py` for details.
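
As a quick illustration of that line format, here is a hedged sketch of splitting one preprocessed line back into the three id lists. It assumes each column is a space-separated list of integer ids; the helper and the sample ids below are illustrative, not the repository's `data.py`.

```python
# Each preprocessed line holds three ";"-separated columns:
# token_ids;token_type_ids;position_ids (each column assumed space-separated).
def parse_example(line):
    token_ids, token_type_ids, position_ids = (
        [int(x) for x in col.split()] for col in line.strip().split(";")
    )
    return token_ids, token_type_ids, position_ids


sample = "1 25 324 2;0 0 0 0;0 1 2 3"  # made-up ids, for illustration only
print(parse_example(sample))
```
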

@@ -29,10 +30,9 @@ Details of the UnifiedTransformer model can be found in [the paper](https://arxiv.org/abs/2006…
Run the following command to finetune on the sample training set and validate on the sample validation set

```shell
export CUDA_VISIBLE_DEVICES=0
# GPU launch: the `n_gpus` parameter specifies the number of GPUs used for training; it can be a single card or multiple cards, and defaults to a single card
python -u finetune.py \
--n_gpus=1 \
# GPU launch: the `--gpus` parameter specifies the GPU card ids used for training; it can be a single card or multiple cards
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" --log_dir ./log finetune.py \
--model_name_or_path=unified_transformer-12L-cn \
--train_data_path=./datasets/train.txt \
--valid_data_path=./datasets/valid.txt \
@@ -46,11 +46,12 @@
--weight_decay=0.01 \
--warmup_steps=4000 \
--max_grad_norm=0.1 \
--sort_pool_size=65536
--sort_pool_size=65536 \
--device=gpu
```

The parameters are described as follows:
- `n_gpus`: the number of GPU cards to use. For multi-card training, set it to the desired number, which must not exceed the number of GPUs configured in the CUDA_VISIBLE_DEVICES environment variable; if 0, the CPU is used.
- `gpus`: the GPU card ids used for training.
- `model_name_or_path`: the pretrained model used for finetuning. It can be a pretrained model provided by PaddleNLP or a local one. For a local pretrained model, set it to the local model directory, e.g. ./checkpoints/model_xx/, which must contain the Paddle pretrained model file model_state.pdparams. For a PaddleNLP pretrained model, choose one of the following.

| Pretrained models provided by PaddleNLP |
@@ -71,6 +72,7 @@ python -u finetune.py \
- `warmup_steps`: the number of iterations over which the learning rate ramps up to the base learning rate (the lr configured above); an early use of warmup is described in [this paper](https://arxiv.org/pdf/1706.02677.pdf). A minimal sketch of such a schedule is given after this list.
- `max_grad_norm`: the maximum gradient value allowed by gradient clipping.
- `sort_pool_size`: the pool size used for sorting when building batches.
- `device`: the device used for training.

For parameter details and default values, see `args.py`.
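
For intuition about `warmup_steps`, here is a minimal sketch of a linear warmup schedule: the learning rate rises from 0 to the base `lr` over `warmup_steps` iterations and is held constant afterwards. This is a conceptual illustration only, not PaddleNLP's exact scheduler.

```python
def warmup_lr(step, base_lr=1e-5, warmup_steps=4000):
    """Linear warmup: ramp from 0 up to base_lr, then stay at base_lr."""
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr


# The rate grows linearly during warmup and is flat afterwards.
for step in (0, 1000, 4000, 8000):
    print(step, warmup_lr(step))
```
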

@@ -95,8 +97,8 @@

```shell
export CUDA_VISIBLE_DEVICES=0
# GPU launch: n_gpu specifies the number of GPUs used for training; prediction only supports a single card
python -u infer.py \
# GPU launch: prediction only supports a single card
python infer.py \
--model_name_or_path=./checkpoints/model_80000 \
--test_data_path=./datasets/test.txt \
--output_path=./predict.txt \
Expand All @@ -107,7 +109,8 @@ python -u infer.py \
--max_dec_len=64 \
--num_samples=20 \
--decode_strategy=sampling \
--top_k=5
--top_k=5 \
--device=gpu
```

The parameters are described as follows:
@@ -128,6 +131,7 @@ python -u infer.py \
- `num_samples`: the number of sentences generated per sample. For each sample, the model generates `num_samples` sentences, ranks them by probability score, and takes the highest-scoring sentence as the final output.
- `decode_strategy`: the decoding strategy used at prediction time; one of "sampling", "greedy_search", or "beam_search".
- `top_k`: with the "sampling" decoding strategy, token probabilities are sorted in descending order and generated tokens are sampled only from the top `top_k` candidates (a small sketch follows this list).
- `device`: the device used for prediction.

For parameter details and default values, see `args.py`.
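
To make the `decode_strategy=sampling` / `top_k` combination concrete, here is a small hedged sketch of top-k sampling over a toy probability vector. It is illustrative only; the actual decoding is performed inside the model's generation code.

```python
import numpy as np


def top_k_sample(probs, top_k=5, rng=np.random.default_rng(0)):
    """Keep only the top_k most likely tokens, renormalize, then sample one."""
    top_ids = np.argsort(probs)[::-1][:top_k]           # ids of the top_k tokens
    top_probs = probs[top_ids] / probs[top_ids].sum()   # renormalized distribution
    return int(rng.choice(top_ids, p=top_probs))


vocab_probs = np.array([0.02, 0.40, 0.05, 0.25, 0.08, 0.20])  # toy distribution
print(top_k_sample(vocab_probs, top_k=5))
```
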

2 changes: 1 addition & 1 deletion examples/dialogue/lic2021_baseline/args.py
@@ -13,7 +13,6 @@ def parse_args():
    parser.add_argument('--logging_steps', type=int, default=500, help='Log every X update steps.')
    parser.add_argument('--save_steps', type=int, default=8000, help='Save checkpoint every X update steps.')
    parser.add_argument('--seed', type=int, default=2021, help='Random seed for initialization.')
    parser.add_argument('--n_gpus', type=int, default=1, help='The number of gpus to use, 0 for cpu.')
    parser.add_argument('--batch_size', type=int, default=8192, required=True, help='Batch size per GPU/CPU for training.')
    parser.add_argument('--lr', type=float, default=1e-5, help='The initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=0.01, help='The weight decay for optimizer.')
@@ -31,6 +30,7 @@ def parse_args():
    parser.add_argument('--num_beams', type=int, default=0, help='The number of beams for beam search.')
    parser.add_argument('--length_penalty', type=float, default=1.0, help='The exponential penalty to the sequence length for beam search.')
    parser.add_argument('--early_stopping', type=eval, default=False, help='Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.')
    parser.add_argument('--device', type=str, default='gpu', help='The device to use for training.')

    args = parser.parse_args()
    return args
10 changes: 5 additions & 5 deletions examples/dialogue/lic2021_baseline/data.py
@@ -18,16 +18,16 @@ def __init__(self,
                 bos_token_id,
                 sort_pool_size=2**16,
                 seed=1,
                 n_gpus=None,
                 n_procs=None,
                 rank=None,
                 mode='test'):
        super(DialogueDataset, self).__init__()

        self.file_list = glob(filepattern)
        self.sort_pool_size = 0 if mode == 'test' else sort_pool_size
        self.n_gpus = n_gpus if n_gpus else dist.get_world_size()
        self.n_procs = n_procs if n_procs else dist.get_world_size()
        self.rank = rank if rank else dist.get_rank()
        self.batch_size = batch_size * self.n_gpus
        self.batch_size = batch_size * self.n_procs
        self.shuffle = True if mode == 'train' else False
        self.mode = mode
        self.pad_id = pad_token_id
@@ -154,8 +154,8 @@ def __iter__(self):
        for batch_data in self.get_batch:
            # sample [token_ids, type_ids, pos_ids, tgt_start_idx]
            # raw_batch [sample0, sample1, ...]
            if self.n_gpus > 1:
                batch_data = batch_data[self.rank::self.n_gpus]
            if self.n_procs > 1:
                batch_data = batch_data[self.rank::self.n_procs]
            batch_data = zip(*batch_data)
            token_ids, type_ids, pos_ids, tgt_start_idx = batch_data

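The renamed `n_procs` slicing above shards one globally assembled batch across processes: each rank keeps every `n_procs`-th sample starting from its own index. A tiny standalone sketch of that idea (plain Python, outside the dataset class):

```python
# A "global" batch of 8 samples, built once with batch_size * n_procs items.
global_batch = list(range(8))
n_procs = 2

for rank in range(n_procs):
    # Same slicing as batch_data[self.rank::self.n_procs] in DialogueDataset.
    local_batch = global_batch[rank::n_procs]
    print("rank", rank, "gets", local_batch)
# rank 0 gets [0, 2, 4, 6]
# rank 1 gets [1, 3, 5, 7]
```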