# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
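
"""Command-line argument definitions for a PaddleNLP pre-training script.

parse_args(MODEL_CLASSES) builds an argparse parser covering data I/O,
optimizer and learning-rate settings, 4D parallelism (data / sharding /
model / pipeline), AMP, and profiling options.
"""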
import argparse

import paddle
from paddlenlp.trainer.argparser import strtobool
from paddlenlp.utils.log import logger


def parse_args(MODEL_CLASSES):
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
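    # Note: `sum(list_of_lists, [])` below flattens the per-model lists of
    # pretrained configuration names into a single list for the help message.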
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(
sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], [])
),
)
# Train I/O config
parser.add_argument(
"--input_dir",
default=None,
type=str,
required=True,
help="The input directory where the data will be read from.",
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the training logs and checkpoints will be written.",
)
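    # Assumption: the split values are relative weights, e.g. "949,50,1"
    # yields roughly 94.9% train / 5% validation / 0.1% test.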
parser.add_argument("--split", type=str, default="949,50,1", help="Train/valid/test data split.")
parser.add_argument("--max_seq_len", type=int, default=1024, help="Max sequence length.")
parser.add_argument(
"--micro_batch_size",
default=8,
type=int,
help="Batch size per device for one step training.",
)
parser.add_argument(
"--global_batch_size",
default=None,
type=int,
help="Global batch size for all training process. None for not check the size is valid. If we only use data parallelism, it should be device_num * micro_batch_size.",
)
# Default training config
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--grad_clip", default=0.0, type=float, help="Grad clip for the parameter.")
parser.add_argument("--max_lr", default=1e-4, type=float, help="The initial max learning rate for Adam.")
parser.add_argument("--min_lr", default=1e-5, type=float, help="The initial min learning rate for Adam.")
parser.add_argument(
"--warmup_rate", default=0.01, type=float, help="Linear warmup over warmup_steps for learing rate."
)
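    # Assumption: with the cosine decay style selected below, the learning rate
    # warms up linearly to max_lr, then decays toward min_lr, staying at min_lr
    # once the step count passes decay_steps.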
# Adam optimizer config
parser.add_argument(
"--adam_beta1",
default=0.9,
type=float,
help="The beta1 for Adam optimizer. The exponential decay rate for the 1st moment estimates.",
)
parser.add_argument(
"--adam_beta2",
default=0.999,
type=float,
help="The bate2 for Adam optimizer. The exponential decay rate for the 2nd moment estimates.",
)
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
# Training steps config
parser.add_argument("--max_steps", default=500000, type=int, help="set total number of training steps to perform.")
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
parser.add_argument(
"--decay_steps",
default=360000,
type=int,
help="The steps use to control the learing rate. If the step > decay_steps, will use the min_lr.",
)
parser.add_argument("--logging_freq", type=int, default=1, help="Log every X updates steps.")
parser.add_argument("--eval_freq", type=int, default=500, help="Evaluate for every X updates steps.")
parser.add_argument("--eval_iters", type=int, default=10, help="Evaluate the model use X steps data.")
# Config for 4D Parallelism
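    # The total device count is typically dp_degree * sharding_degree *
    # mp_degree * pp_degree.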
    parser.add_argument(
        "--use_sharding", type=strtobool, nargs="?", const=False, help="Use sharding parallelism for training."
    )
    parser.add_argument(
        "--sharding_degree", type=int, default=1, help="Sharding degree. Shard the parameters across multiple cards."
    )
    parser.add_argument("--dp_degree", type=int, default=1, help="Data parallelism degree.")
    parser.add_argument(
        "--mp_degree",
        type=int,
        default=1,
        help="Model parallelism degree. Split the linear layers across multiple cards.",
    )
    parser.add_argument(
        "--pp_degree",
        type=int,
        default=1,
        help="Pipeline parallelism degree. Split the model layers into different pipeline stages.",
    )
    parser.add_argument(
        "--use_recompute", type=strtobool, nargs="?", const=False, help="Use recompute (activation checkpointing) to save memory."
    )
# AMP config
parser.add_argument("--use_amp", type=strtobool, nargs="?", const=False, help="Enable mixed precision training.")
    parser.add_argument(
        "--amp_level", type=str, default="O1", choices=["O1", "O2"], help="AMP level: O1 (mixed precision) or O2 (mostly pure fp16)."
    )
parser.add_argument(
"--enable_addto",
type=strtobool,
nargs="?",
const=True,
help="Whether to enable the addto strategy for gradient accumulation or not. This is only used for AMP training.",
)
parser.add_argument(
"--scale_loss",
type=float,
default=32768,
help="The value of scale_loss for fp16. This is only used for AMP training.",
)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.1, help="The hidden dropout prob.")
parser.add_argument(
"--attention_probs_dropout_prob", type=float, default=0.1, help="The attention probs dropout prob."
)
parser.add_argument("--to_static", action="store_true", help="Whether use to_static to train.")
# Other config
parser.add_argument("--seed", type=int, default=1234, help="Random seed for initialization")
parser.add_argument(
"--check_accuracy", type=strtobool, nargs="?", const=False, help="Check accuracy for training process."
)
parser.add_argument(
"--device", type=str, default="gpu", choices=["cpu", "gpu", "xpu", "npu"], help="select cpu, gpu, xpu devices."
)
parser.add_argument(
"--lr_decay_style", type=str, default="cosine", choices=["cosine", "none"], help="Learning rate decay style."
)
parser.add_argument(
"-p",
"--profiler_options",
type=str,
default=None,
help='The option of profiler, which should be in format "key1=value1;key2=value2;key3=value3".',
)
args = parser.parse_args()
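    # The test pass runs over 10x more batches than each periodic evaluation.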
args.test_iters = args.eval_iters * 10
    if args.check_accuracy:
        if args.hidden_dropout_prob != 0:
            args.hidden_dropout_prob = 0.0
            logger.warning("hidden_dropout_prob is set to 0 for accuracy checking.")
        if args.attention_probs_dropout_prob != 0:
            args.attention_probs_dropout_prob = 0.0
            logger.warning("attention_probs_dropout_prob is set to 0 for accuracy checking.")
logger.info("{:20}:{}".format("paddle commit id", paddle.version.commit))
for arg in vars(args):
logger.info("{:20}:{}".format(arg, getattr(args, arg)))
return args
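

# A minimal usage sketch (an assumption, not part of this file's contract):
# any mapping whose values end in a class exposing
# `pretrained_init_configuration` works, e.g. with PaddleNLP's GPT classes:
#
#     from paddlenlp.transformers import GPTForPretraining, GPTTokenizer
#
#     MODEL_CLASSES = {"gpt": (GPTForPretraining, GPTTokenizer)}
#     args = parse_args(MODEL_CLASSES)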