Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS] Add blank between characters for VITS #2040

Merged
merged 1 commit into from
Jun 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions examples/csmsc/vits/conf/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ generator_first: False # whether to start updating generator first
##########################################################
# OTHER TRAINING SETTING #
##########################################################
max_epoch: 1000 # number of epochs
num_snapshots: 10 # max number of snapshots to keep while training
seed: 777 # random seed number
num_snapshots: 10 # max number of snapshots to keep while training
train_max_steps: 250000 # number of training steps (= total_iters / ngpus, where total_iters = 1000000)
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 250 # Interval steps to evaluate the network.
seed: 777 # random seed number
4 changes: 4 additions & 0 deletions examples/csmsc/vits/local/preprocess.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ stage=0
stop_stage=100

config_path=$1
add_blank=$2

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
Expand Down Expand Up @@ -44,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--feats-stats=dump/train/feats_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt \
--add-blank=${add_blank} \
--skip-wav-copy

python3 ${BIN_DIR}/normalize.py \
Expand All @@ -52,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--feats-stats=dump/train/feats_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt \
--add-blank=${add_blank} \
--skip-wav-copy

python3 ${BIN_DIR}/normalize.py \
Expand All @@ -60,5 +63,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--feats-stats=dump/train/feats_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt \
--add-blank=${add_blank} \
--skip-wav-copy
fi
6 changes: 5 additions & 1 deletion examples/csmsc/vits/local/synthesize_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
config_path=$1
train_output_path=$2
ckpt_name=$3
add_blank=$4

stage=0
stop_stage=0


if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
Expand All @@ -14,5 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt
--text=${BIN_DIR}/../sentences.txt \
--add-blank=${add_blank}
fi
5 changes: 3 additions & 2 deletions examples/csmsc/vits/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
add_blank=true

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
Expand All @@ -18,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
./local/preprocess.sh ${conf_path} ${add_blank}|| exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
Expand All @@ -32,5 +33,5 @@ fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
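# synthesize_e2e, vocoder is pwgan
# NOTE(review): the "vocoder is pwgan" remark looks carried over from another recipe; the visible synthesize_e2e.sh arguments pass no vocoder options -- confirm and update.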
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ${add_blank}|| exit -1
fi
3 changes: 1 addition & 2 deletions examples/ljspeech/voc0/local/synthesize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@ python ${BIN_DIR}/synthesize.py \
--input=${input_mel_path} \
--output=${train_output_path}/wavs/ \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--ngpu=1 \
--verbose
--ngpu=1
24 changes: 1 addition & 23 deletions paddlespeech/t2s/exps/fastspeech2/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,30 +58,8 @@ def main():
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--speaker-dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()

# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
args = parser.parse_args()

dumpdir = Path(args.dumpdir).expanduser()
# use absolute path
Expand Down
9 changes: 0 additions & 9 deletions paddlespeech/t2s/exps/fastspeech2/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,6 @@ def main():

parser.add_argument("--config", type=str, help="fastspeech2 config file.")

parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")

Expand Down Expand Up @@ -248,10 +243,6 @@ def main():
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))

if args.verbose > 1:
print(vars(args))
print(config)

sentences, speaker_set = get_phn_dur(dur_file)

merge_silence(sentences)
Expand Down
24 changes: 1 addition & 23 deletions paddlespeech/t2s/exps/gan_vocoder/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,8 @@ def main():
default=False,
action="store_true",
help="whether to skip the copy of wav files.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()

# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
args = parser.parse_args()

dumpdir = Path(args.dumpdir).expanduser()
# use absolute path
Expand Down
9 changes: 0 additions & 9 deletions paddlespeech/t2s/exps/gan_vocoder/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,6 @@ def main():
required=True,
help="directory to dump feature files.")
parser.add_argument("--config", type=str, help="vocoder config file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")
parser.add_argument(
Expand All @@ -197,10 +192,6 @@ def main():
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))

if args.verbose > 1:
print(vars(args))
print(config)

sentences, speaker_set = get_phn_dur(dur_file)
merge_silence(sentences)

Expand Down
23 changes: 0 additions & 23 deletions paddlespeech/t2s/exps/speedyspeech/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,6 @@ def main():
"--tones-dict", type=str, default=None, help="tone vocabulary file.")
parser.add_argument(
"--speaker-dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")

parser.add_argument(
"--use-relative-path",
Expand All @@ -63,24 +58,6 @@ def main():
help="whether use relative path in metadata")
args = parser.parse_args()

# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')

dumpdir = Path(args.dumpdir).expanduser()
# use absolute path
dumpdir = dumpdir.resolve()
Expand Down
9 changes: 0 additions & 9 deletions paddlespeech/t2s/exps/speedyspeech/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,6 @@ def main():

parser.add_argument("--config", type=str, help="fastspeech2 config file.")

parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")

Expand Down Expand Up @@ -230,10 +225,6 @@ def main():
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))

if args.verbose > 1:
print(vars(args))
print(config)

sentences, speaker_set = get_phn_dur(dur_file)

merge_silence(sentences)
Expand Down
9 changes: 0 additions & 9 deletions paddlespeech/t2s/exps/tacotron2/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,6 @@ def main():

parser.add_argument("--config", type=str, help="fastspeech2 config file.")

parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")

Expand Down Expand Up @@ -223,10 +218,6 @@ def main():
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))

if args.verbose > 1:
print(vars(args))
print(config)

sentences, speaker_set = get_phn_dur(dur_file)

merge_silence(sentences)
Expand Down
24 changes: 1 addition & 23 deletions paddlespeech/t2s/exps/transformer_tts/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,30 +51,8 @@ def main():
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--speaker-dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()

# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
args = parser.parse_args()

# check directory existence
dumpdir = Path(args.dumpdir).resolve()
Expand Down
9 changes: 0 additions & 9 deletions paddlespeech/t2s/exps/transformer_tts/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,6 @@ def main():
type=str,
help="yaml format configuration file.")

parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")

Expand All @@ -210,10 +205,6 @@ def main():
_C = Configuration(_C)
config = _C.clone()

if args.verbose > 1:
print(vars(args))
print(config)

phone_id_map_path = dumpdir / "phone_id_map.txt"
speaker_id_map_path = dumpdir / "speaker_id_map.txt"

Expand Down
Loading