diff --git a/bins/qvc/inf_preprocess.py b/bins/qvc/inf_preprocess.py
new file mode 100644
index 00000000..6af6301e
--- /dev/null
+++ b/bins/qvc/inf_preprocess.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import shutil
+import argparse
+
+
+def inf_preprocess(file1, file2):
+    """Stage the two input audios into the temporary folders used by this recipe."""
+    # Duplicate the first audio as A1-A4.wav to build the minimal fine-tuning
+    # dataset expected by inf_config_[model_type].json (dataset "temp1").
+    source_file = file1
+    target_folder1 = 'temp/temp1/temp2/song1'
+    if not os.path.exists(target_folder1):
+        os.makedirs(target_folder1)
+    for i in range(1, 5):
+        new_file_name = f'A{i}.wav'
+        new_file_path = os.path.join(target_folder1, new_file_name)
+        shutil.copy(source_file, new_file_path)
+        print(f'Copied {source_file} to {new_file_path}')
+    # Stage the second audio as temp/temp0/B.wav; it is the audio to be converted.
+    target_folder2 = 'temp/temp0'
+    source_file = file2
+    if not os.path.exists(target_folder2):
+        os.makedirs(target_folder2)
+    new_file_name = 'B.wav'
+    new_file_path = os.path.join(target_folder2, new_file_name)
+    shutil.copy(source_file, new_file_path)
+    print(f'Copied {source_file} to {new_file_path}')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--infsource",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    args = parser.parse_args()
+    # Duplicate the given audio as A1-A4.wav to build the minimal fine-tuning dataset.
+    source_file = args.infsource
+    target_folder = 'temp/temp1/temp2/song1'
+    if not os.path.exists(target_folder):
+        os.makedirs(target_folder)
+
+    for i in range(1, 5):
+        new_file_name = f'A{i}.wav'
+        new_file_path = os.path.join(target_folder, new_file_name)
+        shutil.copy(source_file, new_file_path)
+        print(f'Copied {source_file} to {new_file_path}')
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/bins/qvc/inference.py b/bins/qvc/inference.py
new file mode 100644
index 00000000..7f8a0f4d
--- /dev/null
+++ b/bins/qvc/inference.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
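+
+# Conversion entry point for this recipe: it splits the source audio into
+# overlapping segments, extracts acoustic and content features for them, runs
+# the selected acoustic model and vocoder, and merges the converted segments
+# back into a single result.wav.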
+ +import argparse +import os +import glob +from tqdm import tqdm +import json +import torch +import time + +from models.svc.diffusion.diffusion_inference import DiffusionInference +from models.svc.comosvc.comosvc_inference import ComoSVCInference +from models.svc.transformer.transformer_inference import TransformerInference +from models.svc.vits.vits_inference import VitsInference +from utils.util import load_config +from utils.audio_slicer import split_audio, merge_segments_encodec +from processors import acoustic_extractor, content_extractor + + +def build_inference(args, cfg, infer_type="from_dataset"): + supported_inference = { + "DiffWaveNetSVC": DiffusionInference, + "DiffComoSVC": ComoSVCInference, + "TransformerSVC": TransformerInference, + "VitsSVC": VitsInference, + } + + inference_class = supported_inference[cfg.model_type] + return inference_class(args, cfg, infer_type) + + +def prepare_for_audio_file(args, cfg, num_workers=1): + preprocess_path = cfg.preprocess.processed_dir + audio_name = cfg.inference.source_audio_name + temp_audio_dir = os.path.join(preprocess_path, audio_name) + + ### eval file + t = time.time() + eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name) + args.source = eval_file + with open(eval_file, "r") as f: + metadata = json.load(f) + print("Prepare for meta eval data: {:.1f}s".format(time.time() - t)) + + ### acoustic features + t = time.time() + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, temp_audio_dir, cfg + ) + if cfg.preprocess.use_min_max_norm_mel == True: + acoustic_extractor.cal_mel_min_max( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + acoustic_extractor.cal_pitch_statistics_svc( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + print("Prepare for acoustic features: {:.1f}s".format(time.time() - t)) + + ### content features + t = time.time() + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + print("Prepare for content features: {:.1f}s".format(time.time() - t)) + return args, cfg, temp_audio_dir + + +def merge_for_audio_segments(audio_files, args, cfg): + audio_name = cfg.inference.source_audio_name + target_singer_name = "result" + + merge_segments_encodec( + wav_files=audio_files, + fs=cfg.preprocess.sample_rate, + output_path=os.path.join( + args.output_dir, "result.wav" + ), + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + for tmp_file in audio_files: + os.remove(tmp_file) + result = os.path.join(args.output_dir, "result.wav") + return result + + +def prepare_source_eval_file(cfg, temp_audio_dir, audio_name): + """ + Prepare the eval file (json) for an audio + """ + + audio_chunks_results = split_audio( + wav_file=cfg.inference.source_audio_path, + target_sr=cfg.preprocess.sample_rate, + output_dir=os.path.join(temp_audio_dir, "wavs"), + max_duration_of_segment=cfg.inference.segments_max_duration, + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + metadata = [] + for i, res in enumerate(audio_chunks_results): + res["index"] = i + res["Dataset"] = audio_name + res["Singer"] = audio_name + res["Uid"] = "{}_{}".format(audio_name, res["Uid"]) + metadata.append(res) + + eval_file = os.path.join(temp_audio_dir, "eval.json") + with open(eval_file, "w") as f: + json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True) + + return eval_file + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + 
torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def infer(args, cfg, infer_type): + # Build inference + t = time.time() + trainer = build_inference(args, cfg, infer_type) + print("Model Init: {:.1f}s".format(time.time() - t)) + + # Run inference + t = time.time() + output_audio_files = trainer.inference() + print("Model inference: {:.1f}s".format(time.time() - t)) + return output_audio_files + + +def build_parser(): + r"""Build argument parser for inference.py. + Anything else should be put in an extra config YAML file. + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--acoustics_dir", + type=str, + help="Acoustics model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--vocoder_dir", + type=str, + required=True, + help="Vocoder checkpoint directory. Searching behavior is the same as " + "the acoustics one.", + ) + parser.add_argument( + "--target_singer", + type=str, + required=True, + help="convert to a specific singer (e.g. --target_singers singer_id).", + ) + parser.add_argument( + "--trans_key", + default=0, + help="0: no pitch shift; autoshift: pitch shift; int: key shift.", + ) + parser.add_argument( + "--source", + type=str, + default="source_audio", + help="Source audio file or directory. If a JSON file is given, " + "inference from dataset is applied. If a directory is given, " + "inference from all wav/flac/mp3 audio files in the directory is applied. " + "Default: inference from all wav/flac/mp3 audio files in ./source_audio", + ) + parser.add_argument( + "--output_dir", + type=str, + default="conversion_results", + help="Output directory. Default: ./conversion_results", + ) + parser.add_argument( + "--log_level", + type=str, + default="warning", + help="Logging level. Default: warning", + ) + parser.add_argument( + "--keep_cache", + action="store_true", + default=True, + help="Keep cache files. Only applicable to inference from files.", + ) + parser.add_argument( + "--diffusion_inference_steps", + type=int, + default=1000, + help="Number of inference steps. 
Only applicable to diffusion inference.", + ) + return parser + + +def main(): + ### Parse arguments and config + args = build_parser().parse_args() + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + if os.path.isdir(args.source): + ### Infer from file + + # Get all the source audio files (.wav, .flac, .mp3) + source_audio_dir = args.source + audio_list = [] + for suffix in ["wav", "flac", "mp3"]: + audio_list += glob.glob( + os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True + ) + print("There are {} source audios: ".format(len(audio_list))) + + # Infer for every file as dataset + output_root_path = args.output_dir + for audio_path in tqdm(audio_list): + audio_name = audio_path.split("/")[-1].split(".")[0] + args.output_dir = os.path.join(output_root_path, audio_name) + print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name)) + + cfg.inference.source_audio_path = audio_path + cfg.inference.source_audio_name = audio_name + cfg.inference.segments_max_duration = 10.0 + cfg.inference.segments_overlap_duration = 1.0 + + # Prepare metadata and features + args, cfg, cache_dir = prepare_for_audio_file(args, cfg) + + # Infer from file + output_audio_files = infer(args, cfg, infer_type="from_file") + + # Merge the split segments + merge_for_audio_segments(output_audio_files, args, cfg) + + # Keep or remove caches + if not args.keep_cache: + os.removedirs(cache_dir) + + else: + ### Infer from dataset + infer(args, cfg, infer_type="from_dataset") + + +if __name__ == "__main__": + main() diff --git a/bins/qvc/post_process.py b/bins/qvc/post_process.py new file mode 100644 index 00000000..d7dc9cb3 --- /dev/null +++ b/bins/qvc/post_process.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import shutil + +def main(): + target_folder = 'temp' + if os.path.exists(target_folder): + shutil.rmtree(target_folder) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bins/qvc/preprocess.py b/bins/qvc/preprocess.py new file mode 100644 index 00000000..453b5001 --- /dev/null +++ b/bins/qvc/preprocess.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset +from preprocessors.metadata import cal_metadata +from processors import acoustic_extractor, content_extractor, data_augment + + +def extract_acoustic_features(dataset, output_path, cfg, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. 
+ """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + dataset_output = os.path.join(output_path, dataset) + + for dataset_type in types: + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + # acoustic_extractor.extract_utt_acoustic_features_parallel( + # metadata, dataset_output, cfg, n_workers=n_workers + # ) + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def extract_content_features(dataset, output_path, cfg, num_workers=1): + """Extract content features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + + +def preprocess(cfg, args): + """Proprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + + ## Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + cfg.task_type, + is_custom_dataset=dataset in cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # Dump metadata of datasets (singers, train/test durations, etc.) 
+ cal_metadata(cfg) + + ## Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features(dataset, output_path, cfg, args.num_workers) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + parser.add_argument("--prepare_alignment", type=bool, default=False) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/qvc/train.py b/bins/qvc/train.py new file mode 100644 index 00000000..0c20d5b4 --- /dev/null +++ b/bins/qvc/train.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
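+
+# Training entry point for this recipe: it expands the augmented dataset names
+# declared in cfg.preprocess.data_augment (pitch shift, formant shift,
+# equalizer, time stretch) and then builds and runs the trainer selected by
+# cfg.model_type.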
+ +import argparse + +import torch + +from models.svc.diffusion.diffusion_trainer import DiffusionTrainer +from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer +from models.svc.transformer.transformer_trainer import TransformerTrainer +from models.svc.vits.vits_trainer import VitsSVCTrainer +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "DiffWaveNetSVC": DiffusionTrainer, + "DiffComoSVC": ComoSVCTrainer, + "TransformerSVC": TransformerTrainer, + "VitsSVC": VitsSVCTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume", + action="store_true", + help="If specified, to resume from the existing checkpoint.", + ) + parser.add_argument( + "--resume_from_ckpt_path", + type=str, + default="", + help="The specific checkpoint path that you want to resume from.", + ) + parser.add_argument( + "--resume_type", + type=str, + default="", + help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights", + ) + + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + args = parser.parse_args() + cfg = load_config(args.config) + + # Data Augmentation + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), + f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, + f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + + # Build trainer + trainer = build_trainer(args, cfg) + + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/bins/qvc/webui.py b/bins/qvc/webui.py new file mode 100644 index 00000000..76b7a7ae --- /dev/null +++ b/bins/qvc/webui.py @@ -0,0 +1,205 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
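+
+# Gradio front end for the quick voice conversion recipe: it stages the two
+# uploaded audios under temp/, fine-tunes the configured checkpoint, runs
+# inference, and returns the converted audio.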
+ +import gradio as gr +import os +import shutil +from inf_preprocess import * +from inference import * +from post_process import * +from preprocess import * +from train import * + +def processing_audio(infsource, tarsource, shifts): + global cfg, args + target_folder = 'temp' + if os.path.exists(target_folder): + shutil.rmtree(target_folder) + inf_preprocess(infsource, tarsource) + preprocess(cfg, args) + if shifts is not None: + args.trans_key = shifts + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), + f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, + f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + # Build trainer + trainer = build_trainer(args, cfg) + trainer.train_loop() + + args.source = 'temp/temp0' + if os.path.isdir(args.source): + ### Infer from file + + # Get all the source audio files (.wav, .flac, .mp3) + source_audio_dir = args.source + audio_list = [] + for suffix in ["wav", "flac", "mp3"]: + audio_list += glob.glob( + os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True + ) + print("There are {} source audios: ".format(len(audio_list))) + + # Infer for every file as dataset + output_root_path = args.output_dir + for audio_path in tqdm(audio_list): + audio_name = audio_path.split("/")[-1].split(".")[0] + args.output_dir = os.path.join(output_root_path, audio_name) + print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name)) + + cfg.inference.source_audio_path = audio_path + cfg.inference.source_audio_name = audio_name + cfg.inference.segments_max_duration = 10.0 + cfg.inference.segments_overlap_duration = 1.0 + + # Prepare metadata and features + args, cfg, cache_dir = prepare_for_audio_file(args, cfg) + + # Infer from file + output_audio_files = infer(args, cfg, infer_type="from_file") + + # Merge the split segments + result = merge_for_audio_segments(output_audio_files, args, cfg) + + # Keep or remove caches + if not args.keep_cache: + os.removedirs(cache_dir) + + else: + ### Infer from dataset + infer(args, cfg, infer_type="from_dataset") + return result + + +def main(): + infsource_audio = gr.Audio(label="Source Audio", type="filepath") + tarsource_audio = gr.Audio(label="Target Audio", type="filepath") + options1 = gr.Dropdown(["autoshift", "-10", "-9", "-8", "-7", "-6", "-5", "-4", "-3", "-2", "-1", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], label="How many semitones you want to transpose?") + outputs = gr.Audio(label="Output Audio") + inputs = [tarsource_audio, infsource_audio, options1] + title = "Amphion-QuickVC" + + gr.Interface(processing_audio, inputs, outputs, title=title).queue().launch(server_name="0.0.0.0", share=True) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + 
parser.add_argument("--prepare_alignment", type=bool, default=False)
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help="If specified, to resume from the existing checkpoint.",
+    )
+    parser.add_argument(
+        "--resume_from_ckpt_path",
+        type=str,
+        default="",
+        help="The specific checkpoint path that you want to resume from.",
+    )
+    parser.add_argument(
+        "--resume_type",
+        type=str,
+        default="",
+        help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
+    )
+    parser.add_argument(
+        "--log_level", default="warning", help="logging level (debug, info, warning)"
+    )
+    parser.add_argument(
+        "--acoustics_dir",
+        type=str,
+        help="Acoustics model checkpoint directory. If a directory is given, "
+        "search for the latest checkpoint dir in the directory. If a specific "
+        "checkpoint dir is given, directly load the checkpoint.",
+    )
+    parser.add_argument(
+        "--vocoder_dir",
+        type=str,
+        required=True,
+        help="Vocoder checkpoint directory. Searching behavior is the same as "
+        "the acoustics one.",
+    )
+    parser.add_argument(
+        "--target_singer",
+        type=str,
+        required=True,
+        help="convert to a specific singer (e.g. --target_singers singer_id).",
+    )
+    parser.add_argument(
+        "--trans_key",
+        default=0,
+        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
+    )
+    parser.add_argument(
+        "--source",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    parser.add_argument(
+        "--infsource",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="conversion_results",
+        help="Output directory. Default: ./conversion_results",
+    )
+
+    parser.add_argument(
+        "--keep_cache",
+        action="store_true",
+        default=True,
+        help="Keep cache files. Only applicable to inference from files.",
+    )
+    parser.add_argument(
+        "--diffusion_inference_steps",
+        type=int,
+        default=1000,
+        help="Number of inference steps. Only applicable to diffusion inference.",
+    )
+
+    args = parser.parse_args()
+    cfg = load_config(args.config)
+    main()
diff --git a/egs/svc/QVC/README.md b/egs/svc/QVC/README.md
new file mode 100644
index 00000000..d4595e0d
--- /dev/null
+++ b/egs/svc/QVC/README.md
@@ -0,0 +1,142 @@
+# Quick (Singing) Voice Conversion
+
+This is an implementation of a simple Webui that provides quick, text-free, one-shot voice conversion for the uninitiated. In theory, the user only needs two short audios (source and target) and a few minutes to obtain the VC result.
+It uses a base model (checkpoint) trained on the VCTK and M4Singer datasets (or other supported datasets) as a foundation, fine-tunes that base model on the input audio, and then performs the voice conversion. It currently supports MultipleContentSVC and VITS.
+
+Like other SVC tasks, there are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference/conversion
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config_[model_type].json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_[model_type].json`:
+
+```json
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
+    "log_dir": "ckpts/svc",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`). `--config_type 1` selects MultipleContentSVC (DiffWaveNet); `2` selects VITS.
+
+```bash
+sh egs/svc/QVC/run.sh --stage 1 --config_type (1 or 2)
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config_[model_type].json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+        "batch_size": 32,
+        ...
+        "adamw": {
+            "lr": 2.0e-4
+        },
+        ...
+    }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
+
+```bash
+sh egs/svc/QVC/run.sh --stage 2 --name [YourExptName] --config_type (1 or 2)
+```
+
+## 4. Inference/Conversion
+
+### Run
+
+`inf_config_[model_type].json` is similar to `exp_config_[model_type].json`: it stores the parameters for the quick fine-tuning and inference run. Be careful when modifying it, especially the temporary storage paths.
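+For reference, the temporary paths below are the ones that the helper scripts of this recipe (`bins/qvc/inf_preprocess.py` and `bins/qvc/post_process.py`) rely on; the excerpt is taken from `inf_config_diff.json` (the VITS config uses the same paths):
+
+```json
+    "dataset": [
+        "temp1"
+    ],
+    "dataset_path": {
+        "temp1": "temp/temp1"
+    },
+    "use_custom_dataset": ["temp1"],
+    "log_dir": "temp/ckpts/svc",
+    "preprocess": {
+        "processed_dir": "temp/data",
+        ...
+    },
+```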
+For inference/conversion, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| ---------- | ----------- | ------- |
+| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
+| `--resume_from_ckpt_path` | The checkpoint path to load model parameters from. | `[Your path to save logs and checkpoints]` |
+| `--infer_source_file` | The audio used for the quick fine-tuning (it is copied into the temporary `temp1` dataset); it should be a *.wav, *.mp3 or *.flac file. | `[Your Audio Path]` |
+| `--target_source_audio_dir` | The directory of audios (*.wav, *.mp3 or *.flac) to be converted with the fine-tuned model. | `[Your Audios Folder]` |
+| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
+| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
+
+Note that the type of the checkpoint's model weights must match the `--config_type` you pass. Now run:
+
+```bash
+sh egs/svc/QVC/run.sh --stage 3 --gpu "0" --config_type (1 or 2) \
+    --resume_from_ckpt_path [Your checkpoint Path] \
+    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
+    --infer_source_file [Your Audio Path] \
+    --target_source_audio_dir [Your Audios Folder] \
+    --infer_key_shift "autoshift"
+```
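+Under the hood, stage 3 first stages the audio given via `--infer_source_file` as a tiny custom dataset, fine-tunes the checkpoint on it, and only then converts the audios from `--target_source_audio_dir` (or `--target_source_file`). The snippet below is a shell sketch of what `bins/qvc/inf_preprocess.py` does at that staging step (shown for illustration; the actual script is Python):
+
+```bash
+# Duplicate the reference audio four times so that it can be preprocessed as
+# the minimal custom dataset "temp1" declared in inf_config_[model_type].json.
+mkdir -p temp/temp1/temp2/song1
+for i in 1 2 3 4; do
+    cp [Your Audio Path] temp/temp1/temp2/song1/A${i}.wav
+done
+```
+
+After the conversion finishes, `bins/qvc/post_process.py` removes the whole `temp/` folder that was created for the fine-tuning run.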
+Before opening the Webui, you need to install:
+
+```
+pip install gradio==3.42.0
+```
+
+Then you can initialize the Webui by running:
+
+```bash
+sh egs/svc/QVC/run.sh --stage 4 --gpu "0" --config_type (1 or 2) \
+    --resume_from_ckpt_path [Your checkpoint Path] \
+    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
+    --infer_source_file [Your Audio Path] \
+    --target_source_audio_dir [Your Audios Folder] \
+    --infer_key_shift "autoshift"
+
+```
\ No newline at end of file
diff --git a/egs/svc/QVC/exp_config_diff.json b/egs/svc/QVC/exp_config_diff.json
new file mode 100644
index 00000000..b607494c
--- /dev/null
+++ b/egs/svc/QVC/exp_config_diff.json
@@ -0,0 +1,127 @@
+{
+    "base_config": "config/svc/diffusion.json",
+    "model_type": "DiffWaveNetSVC",
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
+    "log_dir": "ckpts/svc",
+    "preprocess": {
+        // TODO: Fill in the output data path.
The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online") + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 512, + "n_res_block": 40, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + } + } + }, + "train": { + "batch_size": 32, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/QVC/exp_config_vits.json b/egs/svc/QVC/exp_config_vits.json new file mode 100644 index 00000000..24b53ecd --- /dev/null +++ b/egs/svc/QVC/exp_config_vits.json @@ -0,0 +1,105 @@ +{ + "base_config": "config/vitssvc.json", + "model_type": "VitsSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + + "n_mel": 100, + "sample_rate": 24000, + + // contentvec + "extract_contentvec_feature": true, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": true, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // wenet + "extract_wenet_feature": false, + "wenet_downsample_rate": 4, + "wenet_frameshift": 0.01, + "wenet_sample_rate": 16000, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "use_log_loudness": true, + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + }, + "vits": { + "inter_channels": 384, + "hidden_channels": 384, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false, + }, + "generator": "nsfhifigan", + }, + "train": { + "batch_size": 12, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/svc/QVC/inf_config_diff.json b/egs/svc/QVC/inf_config_diff.json new file mode 100644 index 00000000..7179d351 --- /dev/null +++ b/egs/svc/QVC/inf_config_diff.json @@ -0,0 +1,130 @@ +{ + "base_config": "config/svc/diffusion.json", + "model_type": "DiffWaveNetSVC", + "dataset": [ + // "m4singer", + // "opencpop", + // "opensinger", + // "svcc", + // "vctk", + "temp1" + ], + "dataset_path": { + // TODO: Fill in your dataset path + // "m4singer": "[M4Singer dataset path]", + // "opencpop": "[Opencpop dataset path]", + // "opensinger": "[OpenSinger dataset path]", + // "svcc": "[SVCC dataset path]", + // "vctk": "[VCTK dataset path]", + "temp1": "temp/temp1" + }, + "use_custom_dataset": ["temp1"], + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "temp/ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "temp/data", + // Config for features extraction + "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online") + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 512, + "n_res_block": 40, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + } + } + }, + "train": { + "batch_size": 2, + "gradient_accumulation_step": 1, + "max_epoch": 51, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 2000 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/QVC/inf_config_vits.json b/egs/svc/QVC/inf_config_vits.json new file mode 100644 index 00000000..b8d9d110 --- /dev/null +++ b/egs/svc/QVC/inf_config_vits.json @@ -0,0 +1,102 @@ +{ + "base_config": "config/vitssvc.json", + "model_type": "VitsSVC", + "dataset": [ + // "m4singer", + // "vctk", + "temp1" + ], + "dataset_path": { + // TODO: Fill in your dataset path + // "m4singer": "[M4Singer dataset path]", + // "vctk": "[VCTK dataset path]", + "temp1": "temp/temp1" + }, + "use_custom_dataset": ["temp1"], + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "temp/ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "temp/data", + + "n_mel": 100, + "sample_rate": 24000, + + // contentvec + "extract_contentvec_feature": true, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": true, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // wenet + "extract_wenet_feature": false, + "wenet_downsample_rate": 4, + "wenet_frameshift": 0.01, + "wenet_sample_rate": 16000, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "use_log_loudness": true, + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + }, + "vits": { + "inter_channels": 384, + "hidden_channels": 384, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false, + }, + "generator": "nsfhifigan", + }, + "train": { + "batch_size": 2, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": 11, // -1 means no limit + "save_checkpoint_stride": [ + 10, + 50 + ], + "keep_last": [ + 3, + 2 + ], + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/svc/QVC/run.sh b/egs/svc/QVC/run.sh new file mode 100644 index 00000000..ff0c8289 --- /dev/null +++ b/egs/svc/QVC/run.sh @@ -0,0 +1,299 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,config2:,config_type:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,target_source_file:,infer_source_audio_dir:,target_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Configuration File + -c | --config2) shift; inf_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". 
+ --gpu) shift; gpu=$1 ; shift ;; + --config_type) shift; config_type=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. + --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac). + --infer_source_file) shift; infer_source_file=$1 ; shift ;; + --target_source_file) shift; target_source_file=$1 ; shift ;; + --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;; + --target_source_audio_dir) shift; target_source_audio_dir=$1 ; shift ;; + # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1". + --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;; + # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift". + --infer_key_shift) shift; infer_key_shift=$1 ; shift ;; + # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders. 
--infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1"; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$config_type" ]; then
+    echo "[Error] Please specify the config type (--config_type)"
+    exit 1
+fi
+
+if [ "$config_type" -eq 1 ]; then
+    exp_config="${exp_dir}"/exp_config_diff.json
+    inf_config="${exp_dir}"/inf_config_diff.json
+    echo "Experimental Configuration File: DiffWaveNet"
+fi
+
+if [ "$config_type" -eq 2 ]; then
+    exp_config="${exp_dir}"/exp_config_vits.json
+    inf_config="${exp_dir}"/inf_config_vits.json
+    echo "Experimental Configuration File: VITS"
+fi
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/qvc/preprocess.py \
+        --config $exp_config \
+        --num_workers 8
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    # add default value
+    if [ -z "$resume_from_ckpt_path" ]; then
+        resume_from_ckpt_path=""
+    fi
+
+    if [ -z "$resume_type" ]; then
+        resume_type="resume"
+    fi
+
+    if [ "$resume" = true ]; then
+        echo "Resume from the existing experiment..."
+        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/qvc/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info \
+            --resume \
+            --resume_from_ckpt_path "$resume_from_ckpt_path" \
+            --resume_type "$resume_type"
+    else
+        echo "Start a new experiment..."
+        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/qvc/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info
+    fi
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$exp_name" ]; then
+        exp_name="test"
+    fi
+    echo "Experimental Name: $exp_name"
+
+    if [ -z "$infer_expt_dir" ]; then
+        infer_expt_dir="${work_dir}/temp/ckpts/svc/${exp_name}"
+    fi
+
+    if [ -z "$resume_from_ckpt_path" ]; then
+        echo "[Error] Please specify the ckpt path."
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        echo "[Error] Please specify the output path"
+        exit 1
+    fi
+
+    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
+        echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+        exit 1
+    fi
+
+    if [ -z "$infer_source_file" ]; then
+        infer_source=$infer_source_audio_dir
+    fi
+
+    if [ -z "$infer_source_audio_dir" ]; then
+        infer_source=$infer_source_file
+    fi
+
+    if [ -z "$target_source_file" ] && [ -z "$target_source_audio_dir" ]; then
+        echo "[Error] Please specify the target file/dir. The target source (can be a json file or a dir). For example, the target_source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the target_source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+ exit 1 + fi + + if [ -z "$target_source_file" ]; then + target_source=$target_source_audio_dir + fi + + if [ -z "$target_source_audio_dir" ]; then + target_source=$target_source_file + fi + + if [ -z "$infer_target_speaker" ]; then + infer_target_speaker="temp1_temp2" + fi + + if [ -z "$resume_type" ]; then + resume_type="finetune" + fi + + if [ -z "$infer_key_shift" ]; then + infer_key_shift="autoshift" + fi + + if [ -z "$infer_vocoder_dir" ]; then + infer_vocoder_dir="$work_dir"/pretrained/bigvgan + fi + + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/qvc/inf_preprocess.py \ + --infsource $infer_source + + + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/qvc/preprocess.py \ + --config $inf_config \ + --num_workers 8 + + CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/qvc/train.py \ + --config "$inf_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --resume \ + --resume_from_ckpt_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/qvc/inference.py \ + --config $inf_config \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $target_source \ + --output_dir $infer_output_dir \ + --log_level debug + + python "${work_dir}"/bins/qvc/post_process.py + +fi +if [ $running_stage -eq 4 ]; then + if [ -z "$exp_name" ]; then + exp_name="test" + fi + echo "Exprimental Name: $exp_name" + + if [ -z "$infer_expt_dir" ]; then + infer_expt_dir="${work_dir}/temp/ckpts/svc/${exp_name}" + fi + + if [ -z "$resume_from_ckpt_path" ]; then + echo "[Error] Please specify the ckpt path." + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="${work_dir}/temp" + fi + + if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then + echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)." + exit 1 + fi + + if [ -z "$infer_source_file" ]; then + infer_source=$infer_source_audio_dir + fi + + if [ -z "$infer_source_audio_dir" ]; then + infer_source=$infer_source_file + fi + + if [ -z "$target_source_file" ] && [ -z "$target_source_audio_dir" ]; then + echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)." 
+ exit 1 + fi + + if [ -z "$target_source_file" ]; then + target_source=$target_source_audio_dir + fi + + if [ -z "$target_source_audio_dir" ]; then + target_source=$target_source_file + fi + + if [ -z "$infer_target_speaker" ]; then + infer_target_speaker="temp1_temp2" + fi + + if [ -z "$resume_type" ]; then + resume_type="finetune" + fi + + if [ -z "$infer_key_shift" ]; then + infer_key_shift="autoshift" + fi + + if [ -z "$infer_vocoder_dir" ]; then + infer_vocoder_dir="$work_dir"/pretrained/bigvgan + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/qvc/webui.py \ + --config $inf_config \ + --num_workers 8 \ + --exp_name "$exp_name" \ + --resume \ + --resume_from_ckpt_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $target_source \ + --log_level info \ + --output_dir $infer_output_dir + +fi \ No newline at end of file