remove old datasets, switch spawn to launch (PaddlePaddle#224)
* remove old datasets, spawn->launch

* fix rank
smallv0221 authored Apr 6, 2021
1 parent ef1719b commit d4376d1
Showing 47 changed files with 1,090 additions and 4,029 deletions.
5 changes: 3 additions & 2 deletions docs/data_prepare/dataset_self_defined.rst
@@ -2,12 +2,12 @@
How to define a custom dataset
==============================

By using the :class:`MapDataset` and :class:`IterDataset` provided by PaddleNLP, anyone can easily define their own dataset.
By using :func:`load_dataset`, :class:`MapDataset` and :class:`IterDataset` provided by PaddleNLP, anyone can easily define their own dataset.

Creating a dataset from local files
-----------------------------------

When creating a dataset from local files, we **recommend** writing a read function that matches the format of the local data and passing it to :func:`load_dataset` to create the dataset.

Take the data from the :obj:`waybill_ie` waybill information extraction task as an example:

@@ -25,6 +25,7 @@
labels = labels.split('\002')
yield {'tokens': words, 'labels': labels}
# data_path is a parameter of the read() method
map_ds = load_dataset(read, data_path='train.txt',lazy=False)
iter_ds = load_dataset(read, data_path='train.txt',lazy=True)
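Filled out from the snippet above, the recommended pattern looks roughly like the following sketch. The exact waybill_ie file layout (a header line, then tab-separated token and label sequences joined by '\002') is an assumption based on the excerpt, not taken verbatim from the repository.

```python
from paddlenlp.datasets import load_dataset

def read(data_path):
    """Yield one example dict per line of a waybill-style file (assumed format)."""
    with open(data_path, 'r', encoding='utf-8') as f:
        next(f)  # assumed header line, skipped
        for line in f:
            words, labels = line.rstrip('\n').split('\t')
            words = words.split('\002')
            labels = labels.split('\002')
            yield {'tokens': words, 'labels': labels}

# Keyword arguments other than lazy are forwarded to read(); lazy selects the dataset type.
map_ds = load_dataset(read, data_path='train.txt', lazy=False)   # MapDataset
iter_ds = load_dataset(read, data_path='train.txt', lazy=True)   # IterDataset
```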
3 changes: 2 additions & 1 deletion examples/language_model/rnnlm/README.md
@@ -30,7 +30,8 @@
The command to launch training is as follows:

```
python train.py
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" train.py \
```

When run, the program automatically performs training, evaluation, and testing, and saves the model into the checkpoint directory during training.
8 changes: 4 additions & 4 deletions examples/language_model/rnnlm/args.py
@@ -29,9 +29,9 @@ def parse_args():
default=None,
help="The path of checkpoint to be loaded.")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
'--device',
choices=['cpu', 'gpu'],
default="gpu",
help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
return args
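The same argparse change recurs in every example below: the integer `--n_gpu` count disappears and a `--device` choice takes its place, because the number of processes is now decided by `paddle.distributed.launch` rather than inside the script. A minimal, self-contained sketch of the new flag (other arguments elided):

```python
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="rnnlm training arguments (sketch)")
    parser.add_argument(
        '--device',
        choices=['cpu', 'gpu'],
        default='gpu',
        help="Select which device to train model, defaults to gpu.")
    # ... the remaining arguments (batch size, checkpoint path, ...) stay unchanged ...
    return parser.parse_args()

if __name__ == '__main__':
    print(parse_args().device)  # e.g. `python args.py --device cpu` prints "cpu"
```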
11 changes: 6 additions & 5 deletions examples/language_model/rnnlm/train.py
@@ -71,7 +71,7 @@ def group_texts(examples):


def train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
paddle.set_device(args.device)
data_path = args.data_path
train_loader, valid_loader, test_loader, vocab_size = create_data_loader(
batch_size=args.batch_size,
@@ -121,7 +121,8 @@ def train(args):

if __name__ == '__main__':
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(train, args=(args, ), nprocs=args.n_gpu)
else:
train(args)
assert args.device in [
"cpu", "gpu", "xpu"
], "Invalid device! Available device should be cpu, gpu, or xpu."

train(args)
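With the spawn branch gone, the `__main__` block simply calls `train(args)` and process creation belongs to the launcher. The general shape shared by the updated scripts is sketched below; the `from args import parse_args` import and the elided training loop are assumptions for illustration.

```python
# Single-card:  python train.py --device gpu
# Multi-card:   python -m paddle.distributed.launch --gpus "0,1" train.py --device gpu
import paddle
from args import parse_args  # assumed import path for this sketch

def train(args):
    paddle.set_device(args.device)  # 'cpu' or 'gpu'; no per-script GPU count anymore
    if paddle.distributed.get_world_size() > 1:
        # Each process started by paddle.distributed.launch joins the collective group here.
        paddle.distributed.init_parallel_env()
    # ... build data loaders, model, optimizer and run the training loop ...

if __name__ == '__main__':
    train(parse_args())  # no paddle.distributed.spawn branch
```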
examples/machine_reading_comprehension/DuReader-robust/README.md
@@ -30,7 +30,8 @@ The DuReader-robust dataset is a single-passage, extractive reading comprehension dataset; specifically
Launch Fine-tuning as follows:

```shell
python -u ./run_du.py \
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" run_du.py \
--task_name dureader_robust \
--model_type bert \
--model_name_or_path bert-base-chinese \
@@ -42,10 +43,10 @@
--save_steps 1000 \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
--output_dir ./tmp/dureader_robust/ \
--do_predict \
--output_dir ./tmp/dureader-robust/ \
--do_train \
--n_gpu 1 \
--do_predict \
--device gpu \
```

* `task_name`: the name of the dataset, case-insensitive, e.g. dureader_robust, cmrc2018, drcd.
examples/machine_reading_comprehension/DuReader-robust/args.py
@@ -98,10 +98,10 @@ def parse_args():
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
'--device',
choices=['cpu', 'gpu'],
default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument(
"--doc_stride",
type=int,
15 changes: 6 additions & 9 deletions examples/machine_reading_comprehension/DuReader-robust/run_du.py
@@ -109,16 +109,17 @@ def forward(self, y, label):


def run(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
paddle.set_device(args.device)
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
rank = paddle.distributed.get_rank()

task_name = args.task_name.lower()
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
set_seed(args)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
if rank == 0:
if os.path.exists(args.model_name_or_path):
print("init checkpoint from %s" % args.model_name_or_path)

@@ -259,8 +260,7 @@ def prepare_train_features(examples):
optimizer.clear_grad()

if global_step % args.save_steps == 0 or global_step == num_training_steps:
if (not args.n_gpu > 1
) or paddle.distributed.get_rank() == 0:
if rank == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
@@ -307,7 +307,7 @@ def prepare_validation_features(examples):

return tokenized_examples

if args.do_predict and paddle.distributed.get_rank() == 0:
if args.do_predict and rank == 0:

if args.predict_file:
dev_ds = load_dataset(task_name, data_files=args.predict_file)
@@ -334,7 +334,4 @@ def prepare_validation_features(examples):

if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(run, args=(args, ), nprocs=args.n_gpu)
else:
run(args)
run(args)
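Across the reading-comprehension scripts, the repeated `(not args.n_gpu > 1) or paddle.distributed.get_rank() == 0` condition is replaced by a `rank` variable read once at start-up, and rank 0 alone writes checkpoints. A hedged sketch of that guard; the DataParallel unwrapping and `save_pretrained` calls follow the usual PaddleNLP pattern and are assumed rather than shown in this diff, while the `model_%d` directory naming copies the convention above.

```python
import os
import paddle

def save_checkpoint(model, tokenizer, output_dir, global_step, rank):
    # Only rank 0 touches the filesystem, so multi-card runs do not race on the same files.
    if rank != 0:
        return
    path = os.path.join(output_dir, "model_%d" % global_step)
    os.makedirs(path, exist_ok=True)
    # Unwrap DataParallel before saving (assumed, as in typical PaddleNLP examples).
    model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
    model_to_save.save_pretrained(path)
    tokenizer.save_pretrained(path)
```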
examples/machine_reading_comprehension/DuReader-yesno/README.md
@@ -40,7 +40,8 @@
Launch Fine-tuning as follows:

```shell
python -u ./run_du.py \
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" run_du.py \
--model_type bert \
--model_name_or_path bert-base-chinese \
--max_seq_length 384 \
@@ -52,7 +53,7 @@ python -u ./run_du.py \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
--output_dir ./tmp/dureader-yesno/ \
--n_gpu 1 \
--device gpu \
```

* `model_type`: the type of pretrained model, e.g. bert, ernie, roberta.
@@ -66,4 +67,4 @@ accu: 0.861040
```
After evaluation, the model automatically runs prediction on the test set and writes the submittable results to `prediction.json`.

**NOTE:** To resume model training, model_name_or_path only needs to point to the checkpoint folder, e.g. `--model_name_or_path=./tmp/dureader-yesno/model_19000/`; the program will automatically load the model parameters `/model_state.pdparams` as well as the vocabulary, model config, and tokenizer config.
8 changes: 4 additions & 4 deletions examples/machine_reading_comprehension/DuReader-yesno/args.py
@@ -80,10 +80,10 @@ def parse_args():
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
'--device',
choices=['cpu', 'gpu'],
default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument(
"--do_lower_case",
action='store_false',
13 changes: 5 additions & 8 deletions examples/machine_reading_comprehension/DuReader-yesno/run_du.py
@@ -89,10 +89,10 @@ def predict(model, data_loader):


def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
paddle.set_device(args.device)
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()

rank = paddle.distributed.get_rank()
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
@@ -192,7 +192,7 @@ def do_train(args):
optimizer.clear_grad()

if global_step % args.save_steps == 0 or global_step == num_training_steps:
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
if rank == 0:
evaluate(model, metric, dev_data_loader)
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
@@ -207,7 +207,7 @@ def do_train(args):
if global_step == num_training_steps:
break

if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
if rank == 0:
predictions = predict(model, test_data_loader)
with open('prediction.json', "w") as writer:
writer.write(
@@ -217,7 +217,4 @@ def do_train(args):

if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
do_train(args)
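The final-prediction path gets the same treatment: only rank 0 runs `predict` and writes `prediction.json`. The serialization call is truncated in this view, so the `json.dumps` arguments in the sketch below are an assumption, and `predict_fn` stands in for the `predict` helper defined in run_du.py.

```python
import json
import paddle

def write_predictions(model, test_data_loader, predict_fn, path='prediction.json'):
    """Run prediction and write results on rank 0 only (sketch)."""
    if paddle.distributed.get_rank() != 0:
        return
    predictions = predict_fn(model, test_data_loader)
    with open(path, 'w') as writer:
        # ensure_ascii=False keeps Chinese text readable; the exact dump call is assumed,
        # since it is cut off in the diff above.
        writer.write(json.dumps(predictions, ensure_ascii=False, indent=4) + '\n')
```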
16 changes: 9 additions & 7 deletions examples/machine_reading_comprehension/SQuAD/README.md
@@ -29,21 +29,22 @@ SQuAD v2.0
For SQuAD v1.1, launch Fine-tuning as follows:

```shell
python -u ./run_squad.py \
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" run_squad.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--max_seq_length 384 \
--batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--logging_steps 100 \
--logging_steps 1000 \
--save_steps 1000 \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
--output_dir ./tmp/squad/ \
--device gpu \
--do_train \
--do_predict \
--n_gpu 1
--do_predict
```

* `model_type`: the type of pretrained model, e.g. bert, ernie, roberta.
@@ -68,7 +69,8 @@ python -u ./run_squad.py \
For SQuAD v2.0, launch Fine-tuning as follows:

```shell
python -u ./run_squad.py \
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" run_squad.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--max_seq_length 384 \
Expand All @@ -80,9 +82,9 @@ python -u ./run_squad.py \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
--output_dir ./tmp/squad/ \
--n_gpu 1 \
--device gpu \
--do_train \
--do_pred \
--do_predict \
--version_2_with_negative
```

8 changes: 4 additions & 4 deletions examples/machine_reading_comprehension/SQuAD/args.py
@@ -92,10 +92,10 @@ def parse_args():
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
'--device',
choices=['cpu', 'gpu'],
default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument(
"--doc_stride",
type=int,
19 changes: 7 additions & 12 deletions examples/machine_reading_comprehension/SQuAD/run_squad.py
@@ -109,16 +109,16 @@ def forward(self, y, label):


def run(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
paddle.set_device(args.device)
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()

rank = paddle.distributed.get_rank()
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

set_seed(args)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
if rank == 0:
if os.path.exists(args.model_name_or_path):
print("init checkpoint from %s" % args.model_name_or_path)

@@ -268,8 +268,7 @@ def prepare_train_features(examples):
optimizer.clear_grad()

if global_step % args.save_steps == 0 or global_step == num_training_steps:
if (not args.n_gpu > 1
) or paddle.distributed.get_rank() == 0:
if rank == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
@@ -316,7 +315,7 @@ def prepare_validation_features(examples):

return tokenized_examples

if args.do_predict:
if args.do_predict and rank == 0:
if args.predict_file:
dev_ds = load_dataset('sqaud', data_files=args.predict_file)
elif args.version_2_with_negative:
@@ -339,13 +338,9 @@ def prepare_validation_features(examples):
collate_fn=dev_batchify_fn,
return_list=True)

if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, dev_data_loader, args)
evaluate(model, dev_data_loader, args)


if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(run, args=(args, ), nprocs=args.n_gpu)
else:
run(args)
run(args)
17 changes: 11 additions & 6 deletions paddlenlp/datasets/__init__.py
@@ -12,17 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .chnsenticorp import *
from .dataset import *
from .chnsenticorp import *
from .cmrc2018 import *
from .drcd import *
from .dureader_robust import *
from .glue import *
from .imdb import *
from .lcqmc import *
from .msra_ner import *
from .peoples_daily_ner import *
from .ptb import *
from .squad import *
from .translation import *
from .dureader import *
from .peoples_daily_ner import *
from .poetry import *
from .cmrc2018 import *
from .drcd import *
from .dureader_robust import *
from .glue import *
from .wmt14ende import *
from .couplet import *
from .experimental import load_dataset, DatasetBuilder, MapDataset, IterDataset
from .yahoo_answer_100k import *
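The reordered `__init__.py` now re-exports the experimental loaders at package level, so user code imports everything from `paddlenlp.datasets` directly. A quick usage sketch; the `dureader_robust` name and the `data_files` argument mirror their use in run_du.py above, and `dev.json` / `train.txt` are placeholder paths.

```python
from paddlenlp.datasets import load_dataset, MapDataset, IterDataset

# Built-in dataset by registered name, as run_du.py does for custom prediction files.
dev_ds = load_dataset('dureader_robust', data_files='dev.json')

# Custom dataset from a local reader, as in the docs change at the top of this commit.
def read(data_path):
    with open(data_path, encoding='utf-8') as f:
        for line in f:
            yield {'text': line.rstrip('\n')}

train_ds = load_dataset(read, data_path='train.txt', lazy=False)
print(isinstance(train_ds, MapDataset))  # True when lazy=False; lazy=True gives an IterDataset
```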