diff --git a/bins/qvc/inf_preprocess.py b/bins/qvc/inf_preprocess.py
new file mode 100644
index 00000000..6af6301e
--- /dev/null
+++ b/bins/qvc/inf_preprocess.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import shutil
+import argparse
+
+
+def inf_preprocess(file1, file2):
+    """Stage the two input audios into the temporary folders used by this recipe."""
+    # Duplicate the first audio as A1-A4.wav to build the minimal fine-tuning
+    # dataset expected by inf_config_[model_type].json (dataset "temp1").
+    source_file = file1
+    target_folder1 = 'temp/temp1/temp2/song1'
+    if not os.path.exists(target_folder1):
+        os.makedirs(target_folder1)
+    for i in range(1, 5):
+        new_file_name = f'A{i}.wav'
+        new_file_path = os.path.join(target_folder1, new_file_name)
+        shutil.copy(source_file, new_file_path)
+        print(f'Copied {source_file} to {new_file_path}')
+    # Stage the second audio as temp/temp0/B.wav; it is the audio to be converted.
+    target_folder2 = 'temp/temp0'
+    source_file = file2
+    if not os.path.exists(target_folder2):
+        os.makedirs(target_folder2)
+    new_file_name = 'B.wav'
+    new_file_path = os.path.join(target_folder2, new_file_name)
+    shutil.copy(source_file, new_file_path)
+    print(f'Copied {source_file} to {new_file_path}')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--infsource",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    args = parser.parse_args()
+    # Duplicate the given audio as A1-A4.wav to build the minimal fine-tuning dataset.
+    source_file = args.infsource
+    target_folder = 'temp/temp1/temp2/song1'
+    if not os.path.exists(target_folder):
+        os.makedirs(target_folder)
+
+    for i in range(1, 5):
+        new_file_name = f'A{i}.wav'
+        new_file_path = os.path.join(target_folder, new_file_name)
+        shutil.copy(source_file, new_file_path)
+        print(f'Copied {source_file} to {new_file_path}')
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/bins/qvc/inference.py b/bins/qvc/inference.py
new file mode 100644
index 00000000..7f8a0f4d
--- /dev/null
+++ b/bins/qvc/inference.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
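+
+# Conversion entry point for this recipe: it splits the source audio into
+# overlapping segments, extracts acoustic and content features for them, runs
+# the selected acoustic model and vocoder, and merges the converted segments
+# back into a single result.wav.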
+ +import argparse +import os +import glob +from tqdm import tqdm +import json +import torch +import time + +from models.svc.diffusion.diffusion_inference import DiffusionInference +from models.svc.comosvc.comosvc_inference import ComoSVCInference +from models.svc.transformer.transformer_inference import TransformerInference +from models.svc.vits.vits_inference import VitsInference +from utils.util import load_config +from utils.audio_slicer import split_audio, merge_segments_encodec +from processors import acoustic_extractor, content_extractor + + +def build_inference(args, cfg, infer_type="from_dataset"): + supported_inference = { + "DiffWaveNetSVC": DiffusionInference, + "DiffComoSVC": ComoSVCInference, + "TransformerSVC": TransformerInference, + "VitsSVC": VitsInference, + } + + inference_class = supported_inference[cfg.model_type] + return inference_class(args, cfg, infer_type) + + +def prepare_for_audio_file(args, cfg, num_workers=1): + preprocess_path = cfg.preprocess.processed_dir + audio_name = cfg.inference.source_audio_name + temp_audio_dir = os.path.join(preprocess_path, audio_name) + + ### eval file + t = time.time() + eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name) + args.source = eval_file + with open(eval_file, "r") as f: + metadata = json.load(f) + print("Prepare for meta eval data: {:.1f}s".format(time.time() - t)) + + ### acoustic features + t = time.time() + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, temp_audio_dir, cfg + ) + if cfg.preprocess.use_min_max_norm_mel == True: + acoustic_extractor.cal_mel_min_max( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + acoustic_extractor.cal_pitch_statistics_svc( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + print("Prepare for acoustic features: {:.1f}s".format(time.time() - t)) + + ### content features + t = time.time() + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + print("Prepare for content features: {:.1f}s".format(time.time() - t)) + return args, cfg, temp_audio_dir + + +def merge_for_audio_segments(audio_files, args, cfg): + audio_name = cfg.inference.source_audio_name + target_singer_name = "result" + + merge_segments_encodec( + wav_files=audio_files, + fs=cfg.preprocess.sample_rate, + output_path=os.path.join( + args.output_dir, "result.wav" + ), + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + for tmp_file in audio_files: + os.remove(tmp_file) + result = os.path.join(args.output_dir, "result.wav") + return result + + +def prepare_source_eval_file(cfg, temp_audio_dir, audio_name): + """ + Prepare the eval file (json) for an audio + """ + + audio_chunks_results = split_audio( + wav_file=cfg.inference.source_audio_path, + target_sr=cfg.preprocess.sample_rate, + output_dir=os.path.join(temp_audio_dir, "wavs"), + max_duration_of_segment=cfg.inference.segments_max_duration, + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + metadata = [] + for i, res in enumerate(audio_chunks_results): + res["index"] = i + res["Dataset"] = audio_name + res["Singer"] = audio_name + res["Uid"] = "{}_{}".format(audio_name, res["Uid"]) + metadata.append(res) + + eval_file = os.path.join(temp_audio_dir, "eval.json") + with open(eval_file, "w") as f: + json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True) + + return eval_file + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + 
torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def infer(args, cfg, infer_type): + # Build inference + t = time.time() + trainer = build_inference(args, cfg, infer_type) + print("Model Init: {:.1f}s".format(time.time() - t)) + + # Run inference + t = time.time() + output_audio_files = trainer.inference() + print("Model inference: {:.1f}s".format(time.time() - t)) + return output_audio_files + + +def build_parser(): + r"""Build argument parser for inference.py. + Anything else should be put in an extra config YAML file. + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--acoustics_dir", + type=str, + help="Acoustics model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--vocoder_dir", + type=str, + required=True, + help="Vocoder checkpoint directory. Searching behavior is the same as " + "the acoustics one.", + ) + parser.add_argument( + "--target_singer", + type=str, + required=True, + help="convert to a specific singer (e.g. --target_singers singer_id).", + ) + parser.add_argument( + "--trans_key", + default=0, + help="0: no pitch shift; autoshift: pitch shift; int: key shift.", + ) + parser.add_argument( + "--source", + type=str, + default="source_audio", + help="Source audio file or directory. If a JSON file is given, " + "inference from dataset is applied. If a directory is given, " + "inference from all wav/flac/mp3 audio files in the directory is applied. " + "Default: inference from all wav/flac/mp3 audio files in ./source_audio", + ) + parser.add_argument( + "--output_dir", + type=str, + default="conversion_results", + help="Output directory. Default: ./conversion_results", + ) + parser.add_argument( + "--log_level", + type=str, + default="warning", + help="Logging level. Default: warning", + ) + parser.add_argument( + "--keep_cache", + action="store_true", + default=True, + help="Keep cache files. Only applicable to inference from files.", + ) + parser.add_argument( + "--diffusion_inference_steps", + type=int, + default=1000, + help="Number of inference steps. 
Only applicable to diffusion inference.", + ) + return parser + + +def main(): + ### Parse arguments and config + args = build_parser().parse_args() + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + if os.path.isdir(args.source): + ### Infer from file + + # Get all the source audio files (.wav, .flac, .mp3) + source_audio_dir = args.source + audio_list = [] + for suffix in ["wav", "flac", "mp3"]: + audio_list += glob.glob( + os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True + ) + print("There are {} source audios: ".format(len(audio_list))) + + # Infer for every file as dataset + output_root_path = args.output_dir + for audio_path in tqdm(audio_list): + audio_name = audio_path.split("/")[-1].split(".")[0] + args.output_dir = os.path.join(output_root_path, audio_name) + print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name)) + + cfg.inference.source_audio_path = audio_path + cfg.inference.source_audio_name = audio_name + cfg.inference.segments_max_duration = 10.0 + cfg.inference.segments_overlap_duration = 1.0 + + # Prepare metadata and features + args, cfg, cache_dir = prepare_for_audio_file(args, cfg) + + # Infer from file + output_audio_files = infer(args, cfg, infer_type="from_file") + + # Merge the split segments + merge_for_audio_segments(output_audio_files, args, cfg) + + # Keep or remove caches + if not args.keep_cache: + os.removedirs(cache_dir) + + else: + ### Infer from dataset + infer(args, cfg, infer_type="from_dataset") + + +if __name__ == "__main__": + main() diff --git a/bins/qvc/post_process.py b/bins/qvc/post_process.py new file mode 100644 index 00000000..d7dc9cb3 --- /dev/null +++ b/bins/qvc/post_process.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import shutil + +def main(): + target_folder = 'temp' + if os.path.exists(target_folder): + shutil.rmtree(target_folder) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bins/qvc/preprocess.py b/bins/qvc/preprocess.py new file mode 100644 index 00000000..453b5001 --- /dev/null +++ b/bins/qvc/preprocess.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset +from preprocessors.metadata import cal_metadata +from processors import acoustic_extractor, content_extractor, data_augment + + +def extract_acoustic_features(dataset, output_path, cfg, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. 
+ """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + dataset_output = os.path.join(output_path, dataset) + + for dataset_type in types: + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + # acoustic_extractor.extract_utt_acoustic_features_parallel( + # metadata, dataset_output, cfg, n_workers=n_workers + # ) + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def extract_content_features(dataset, output_path, cfg, num_workers=1): + """Extract content features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + + +def preprocess(cfg, args): + """Proprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + + ## Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + cfg.task_type, + is_custom_dataset=dataset in cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # Dump metadata of datasets (singers, train/test durations, etc.) 
+ cal_metadata(cfg) + + ## Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features(dataset, output_path, cfg, args.num_workers) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + parser.add_argument("--prepare_alignment", type=bool, default=False) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/qvc/train.py b/bins/qvc/train.py new file mode 100644 index 00000000..0c20d5b4 --- /dev/null +++ b/bins/qvc/train.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
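+
+# Training entry point for this recipe: it expands the augmented dataset names
+# declared in cfg.preprocess.data_augment (pitch shift, formant shift,
+# equalizer, time stretch) and then builds and runs the trainer selected by
+# cfg.model_type.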
+ +import argparse + +import torch + +from models.svc.diffusion.diffusion_trainer import DiffusionTrainer +from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer +from models.svc.transformer.transformer_trainer import TransformerTrainer +from models.svc.vits.vits_trainer import VitsSVCTrainer +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "DiffWaveNetSVC": DiffusionTrainer, + "DiffComoSVC": ComoSVCTrainer, + "TransformerSVC": TransformerTrainer, + "VitsSVC": VitsSVCTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume", + action="store_true", + help="If specified, to resume from the existing checkpoint.", + ) + parser.add_argument( + "--resume_from_ckpt_path", + type=str, + default="", + help="The specific checkpoint path that you want to resume from.", + ) + parser.add_argument( + "--resume_type", + type=str, + default="", + help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights", + ) + + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + args = parser.parse_args() + cfg = load_config(args.config) + + # Data Augmentation + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), + f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, + f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + + # Build trainer + trainer = build_trainer(args, cfg) + + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/bins/qvc/webui.py b/bins/qvc/webui.py new file mode 100644 index 00000000..76b7a7ae --- /dev/null +++ b/bins/qvc/webui.py @@ -0,0 +1,205 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
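+
+# Gradio front end for the quick voice conversion recipe: it stages the two
+# uploaded audios under temp/, fine-tunes the configured checkpoint, runs
+# inference, and returns the converted audio.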
+ +import gradio as gr +import os +import shutil +from inf_preprocess import * +from inference import * +from post_process import * +from preprocess import * +from train import * + +def processing_audio(infsource, tarsource, shifts): + global cfg, args + target_folder = 'temp' + if os.path.exists(target_folder): + shutil.rmtree(target_folder) + inf_preprocess(infsource, tarsource) + preprocess(cfg, args) + if shifts is not None: + args.trans_key = shifts + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), + f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, + f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + # Build trainer + trainer = build_trainer(args, cfg) + trainer.train_loop() + + args.source = 'temp/temp0' + if os.path.isdir(args.source): + ### Infer from file + + # Get all the source audio files (.wav, .flac, .mp3) + source_audio_dir = args.source + audio_list = [] + for suffix in ["wav", "flac", "mp3"]: + audio_list += glob.glob( + os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True + ) + print("There are {} source audios: ".format(len(audio_list))) + + # Infer for every file as dataset + output_root_path = args.output_dir + for audio_path in tqdm(audio_list): + audio_name = audio_path.split("/")[-1].split(".")[0] + args.output_dir = os.path.join(output_root_path, audio_name) + print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name)) + + cfg.inference.source_audio_path = audio_path + cfg.inference.source_audio_name = audio_name + cfg.inference.segments_max_duration = 10.0 + cfg.inference.segments_overlap_duration = 1.0 + + # Prepare metadata and features + args, cfg, cache_dir = prepare_for_audio_file(args, cfg) + + # Infer from file + output_audio_files = infer(args, cfg, infer_type="from_file") + + # Merge the split segments + result = merge_for_audio_segments(output_audio_files, args, cfg) + + # Keep or remove caches + if not args.keep_cache: + os.removedirs(cache_dir) + + else: + ### Infer from dataset + infer(args, cfg, infer_type="from_dataset") + return result + + +def main(): + infsource_audio = gr.Audio(label="Source Audio", type="filepath") + tarsource_audio = gr.Audio(label="Target Audio", type="filepath") + options1 = gr.Dropdown(["autoshift", "-10", "-9", "-8", "-7", "-6", "-5", "-4", "-3", "-2", "-1", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], label="How many semitones you want to transpose?") + outputs = gr.Audio(label="Output Audio") + inputs = [tarsource_audio, infsource_audio, options1] + title = "Amphion-QuickVC" + + gr.Interface(processing_audio, inputs, outputs, title=title).queue().launch(server_name="0.0.0.0", share=True) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + 
parser.add_argument("--prepare_alignment", type=bool, default=False)
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help="If specified, to resume from the existing checkpoint.",
+    )
+    parser.add_argument(
+        "--resume_from_ckpt_path",
+        type=str,
+        default="",
+        help="The specific checkpoint path that you want to resume from.",
+    )
+    parser.add_argument(
+        "--resume_type",
+        type=str,
+        default="",
+        help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
+    )
+    parser.add_argument(
+        "--log_level", default="warning", help="logging level (debug, info, warning)"
+    )
+    parser.add_argument(
+        "--acoustics_dir",
+        type=str,
+        help="Acoustics model checkpoint directory. If a directory is given, "
+        "search for the latest checkpoint dir in the directory. If a specific "
+        "checkpoint dir is given, directly load the checkpoint.",
+    )
+    parser.add_argument(
+        "--vocoder_dir",
+        type=str,
+        required=True,
+        help="Vocoder checkpoint directory. Searching behavior is the same as "
+        "the acoustics one.",
+    )
+    parser.add_argument(
+        "--target_singer",
+        type=str,
+        required=True,
+        help="convert to a specific singer (e.g. --target_singers singer_id).",
+    )
+    parser.add_argument(
+        "--trans_key",
+        default=0,
+        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
+    )
+    parser.add_argument(
+        "--source",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    parser.add_argument(
+        "--infsource",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="conversion_results",
+        help="Output directory. Default: ./conversion_results",
+    )
+
+    parser.add_argument(
+        "--keep_cache",
+        action="store_true",
+        default=True,
+        help="Keep cache files. Only applicable to inference from files.",
+    )
+    parser.add_argument(
+        "--diffusion_inference_steps",
+        type=int,
+        default=1000,
+        help="Number of inference steps. Only applicable to diffusion inference.",
+    )
+
+    args = parser.parse_args()
+    cfg = load_config(args.config)
+    main()
diff --git a/egs/svc/QVC/README.md b/egs/svc/QVC/README.md
new file mode 100644
index 00000000..d4595e0d
--- /dev/null
+++ b/egs/svc/QVC/README.md
@@ -0,0 +1,142 @@
+# Quick (Singing) Voice Conversion
+
+This is an implementation of a simple Webui that provides quick, text-free, one-shot voice conversion for the uninitiated. In theory, the user only needs two short audios (source and target) and a few minutes to obtain the VC result.
+It uses a base model (checkpoint) trained on the VCTK and M4Singer datasets (or other supported datasets) as a foundation, fine-tunes that base model on the input audio, and then performs the voice conversion. It currently supports MultipleContentSVC and VITS.
+
+Like other SVC tasks, there are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference/conversion
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config_[model_type].json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_[model_type].json`:
+
+```json
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
+    "log_dir": "ckpts/svc",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`). `--config_type 1` selects MultipleContentSVC (DiffWaveNet); `2` selects VITS.
+
+```bash
+sh egs/svc/QVC/run.sh --stage 1 --config_type (1 or 2)
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config_[model_type].json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+        "batch_size": 32,
+        ...
+        "adamw": {
+            "lr": 2.0e-4
+        },
+        ...
+    }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
+
+```bash
+sh egs/svc/QVC/run.sh --stage 2 --name [YourExptName] --config_type (1 or 2)
+```
+
+## 4. Inference/Conversion
+
+### Run
+
+`inf_config_[model_type].json` is similar to `exp_config_[model_type].json`: it stores the parameters for the quick fine-tuning and inference run. Be careful when modifying it, especially the temporary storage paths.
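+For reference, the temporary paths below are the ones that the helper scripts of this recipe (`bins/qvc/inf_preprocess.py` and `bins/qvc/post_process.py`) rely on; the excerpt is taken from `inf_config_diff.json` (the VITS config uses the same paths):
+
+```json
+    "dataset": [
+        "temp1"
+    ],
+    "dataset_path": {
+        "temp1": "temp/temp1"
+    },
+    "use_custom_dataset": ["temp1"],
+    "log_dir": "temp/ckpts/svc",
+    "preprocess": {
+        "processed_dir": "temp/data",
+        ...
+    },
+```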
+For inference/conversion, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| ---------- | ----------- | ------- |
+| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
+| `--resume_from_ckpt_path` | The checkpoint path to load model parameters from. | `[Your path to save logs and checkpoints]` |
+| `--infer_source_file` | The audio used for the quick fine-tuning (it is copied into the temporary `temp1` dataset); it should be a *.wav, *.mp3 or *.flac file. | `[Your Audio Path]` |
+| `--target_source_audio_dir` | The directory of audios (*.wav, *.mp3 or *.flac) to be converted with the fine-tuned model. | `[Your Audios Folder]` |
+| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
+| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
+
+Note that the type of the checkpoint's model weights must match the `--config_type` you pass. Now run:
+
+```bash
+sh egs/svc/QVC/run.sh --stage 3 --gpu "0" --config_type (1 or 2) \
+    --resume_from_ckpt_path [Your checkpoint Path] \
+    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
+    --infer_source_file [Your Audio Path] \
+    --target_source_audio_dir [Your Audios Folder] \
+    --infer_key_shift "autoshift"
+```
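+Under the hood, stage 3 first stages the audio given via `--infer_source_file` as a tiny custom dataset, fine-tunes the checkpoint on it, and only then converts the audios from `--target_source_audio_dir` (or `--target_source_file`). The snippet below is a shell sketch of what `bins/qvc/inf_preprocess.py` does at that staging step (shown for illustration; the actual script is Python):
+
+```bash
+# Duplicate the reference audio four times so that it can be preprocessed as
+# the minimal custom dataset "temp1" declared in inf_config_[model_type].json.
+mkdir -p temp/temp1/temp2/song1
+for i in 1 2 3 4; do
+    cp [Your Audio Path] temp/temp1/temp2/song1/A${i}.wav
+done
+```
+
+After the conversion finishes, `bins/qvc/post_process.py` removes the whole `temp/` folder that was created for the fine-tuning run.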
+Before opening the Webui, you need to install:
+
+```
+pip install gradio==3.42.0
+```
+
+Then you can initialize the Webui by running:
+
+```bash
+sh egs/svc/QVC/run.sh --stage 4 --gpu "0" --config_type (1 or 2) \
+    --resume_from_ckpt_path [Your checkpoint Path] \
+    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
+    --infer_source_file [Your Audio Path] \
+    --target_source_audio_dir [Your Audios Folder] \
+    --infer_key_shift "autoshift"
+
+```
\ No newline at end of file
diff --git a/egs/svc/QVC/exp_config_diff.json b/egs/svc/QVC/exp_config_diff.json
new file mode 100644
index 00000000..b607494c
--- /dev/null
+++ b/egs/svc/QVC/exp_config_diff.json
@@ -0,0 +1,127 @@
+{
+    "base_config": "config/svc/diffusion.json",
+    "model_type": "DiffWaveNetSVC",
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
+    "log_dir": "ckpts/svc",
+    "preprocess": {
+        // TODO: Fill in the output data path.
The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online") + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 512, + "n_res_block": 40, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + } + } + }, + "train": { + "batch_size": 32, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/QVC/exp_config_vits.json b/egs/svc/QVC/exp_config_vits.json new file mode 100644 index 00000000..24b53ecd --- /dev/null +++ b/egs/svc/QVC/exp_config_vits.json @@ -0,0 +1,105 @@ +{ + "base_config": "config/vitssvc.json", + "model_type": "VitsSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + + "n_mel": 100, + "sample_rate": 24000, + + // contentvec + "extract_contentvec_feature": true, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": true, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // wenet + "extract_wenet_feature": false, + "wenet_downsample_rate": 4, + "wenet_frameshift": 0.01, + "wenet_sample_rate": 16000, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "use_log_loudness": true, + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + }, + "vits": { + "inter_channels": 384, + "hidden_channels": 384, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false, + }, + "generator": "nsfhifigan", + }, + "train": { + "batch_size": 12, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/svc/QVC/inf_config_diff.json b/egs/svc/QVC/inf_config_diff.json new file mode 100644 index 00000000..7179d351 --- /dev/null +++ b/egs/svc/QVC/inf_config_diff.json @@ -0,0 +1,130 @@ +{ + "base_config": "config/svc/diffusion.json", + "model_type": "DiffWaveNetSVC", + "dataset": [ + // "m4singer", + // "opencpop", + // "opensinger", + // "svcc", + // "vctk", + "temp1" + ], + "dataset_path": { + // TODO: Fill in your dataset path + // "m4singer": "[M4Singer dataset path]", + // "opencpop": "[Opencpop dataset path]", + // "opensinger": "[OpenSinger dataset path]", + // "svcc": "[SVCC dataset path]", + // "vctk": "[VCTK dataset path]", + "temp1": "temp/temp1" + }, + "use_custom_dataset": ["temp1"], + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "temp/ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "temp/data", + // Config for features extraction + "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online") + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 512, + "n_res_block": 40, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + } + } + }, + "train": { + "batch_size": 2, + "gradient_accumulation_step": 1, + "max_epoch": 51, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 2000 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/QVC/inf_config_vits.json b/egs/svc/QVC/inf_config_vits.json new file mode 100644 index 00000000..b8d9d110 --- /dev/null +++ b/egs/svc/QVC/inf_config_vits.json @@ -0,0 +1,102 @@ +{ + "base_config": "config/vitssvc.json", + "model_type": "VitsSVC", + "dataset": [ + // "m4singer", + // "vctk", + "temp1" + ], + "dataset_path": { + // TODO: Fill in your dataset path + // "m4singer": "[M4Singer dataset path]", + // "vctk": "[VCTK dataset path]", + "temp1": "temp/temp1" + }, + "use_custom_dataset": ["temp1"], + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "temp/ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "temp/data", + + "n_mel": 100, + "sample_rate": 24000, + + // contentvec + "extract_contentvec_feature": true, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": true, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // wenet + "extract_wenet_feature": false, + "wenet_downsample_rate": 4, + "wenet_frameshift": 0.01, + "wenet_sample_rate": 16000, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "use_log_loudness": true, + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + }, + "vits": { + "inter_channels": 384, + "hidden_channels": 384, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false, + }, + "generator": "nsfhifigan", + }, + "train": { + "batch_size": 2, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": 11, // -1 means no limit + "save_checkpoint_stride": [ + 10, + 50 + ], + "keep_last": [ + 3, + 2 + ], + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/svc/QVC/run.sh b/egs/svc/QVC/run.sh new file mode 100644 index 00000000..ff0c8289 --- /dev/null +++ b/egs/svc/QVC/run.sh @@ -0,0 +1,299 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,config2:,config_type:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,target_source_file:,infer_source_audio_dir:,target_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Configuration File + -c | --config2) shift; inf_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". 
+ --gpu) shift; gpu=$1 ; shift ;; + --config_type) shift; config_type=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. + --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac). + --infer_source_file) shift; infer_source_file=$1 ; shift ;; + --target_source_file) shift; target_source_file=$1 ; shift ;; + --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;; + --target_source_audio_dir) shift; target_source_audio_dir=$1 ; shift ;; + # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1". + --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;; + # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift". + --infer_key_shift) shift; infer_key_shift=$1 ; shift ;; + # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders. 
--infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1"; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$config_type" ]; then
+    echo "[Error] Please specify the config type (--config_type)"
+    exit 1
+fi
+
+if [ "$config_type" -eq 1 ]; then
+    exp_config="${exp_dir}"/exp_config_diff.json
+    inf_config="${exp_dir}"/inf_config_diff.json
+    echo "Experimental Configuration File: DiffWaveNet"
+fi
+
+if [ "$config_type" -eq 2 ]; then
+    exp_config="${exp_dir}"/exp_config_vits.json
+    inf_config="${exp_dir}"/inf_config_vits.json
+    echo "Experimental Configuration File: VITS"
+fi
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/qvc/preprocess.py \
+        --config $exp_config \
+        --num_workers 8
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    # add default value
+    if [ -z "$resume_from_ckpt_path" ]; then
+        resume_from_ckpt_path=""
+    fi
+
+    if [ -z "$resume_type" ]; then
+        resume_type="resume"
+    fi
+
+    if [ "$resume" = true ]; then
+        echo "Resume from the existing experiment..."
+        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/qvc/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info \
+            --resume \
+            --resume_from_ckpt_path "$resume_from_ckpt_path" \
+            --resume_type "$resume_type"
+    else
+        echo "Start a new experiment..."
+        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/qvc/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info
+    fi
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$exp_name" ]; then
+        exp_name="test"
+    fi
+    echo "Experimental Name: $exp_name"
+
+    if [ -z "$infer_expt_dir" ]; then
+        infer_expt_dir="${work_dir}/temp/ckpts/svc/${exp_name}"
+    fi
+
+    if [ -z "$resume_from_ckpt_path" ]; then
+        echo "[Error] Please specify the ckpt path."
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        echo "[Error] Please specify the output path"
+        exit 1
+    fi
+
+    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
+        echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+        exit 1
+    fi
+
+    if [ -z "$infer_source_file" ]; then
+        infer_source=$infer_source_audio_dir
+    fi
+
+    if [ -z "$infer_source_audio_dir" ]; then
+        infer_source=$infer_source_file
+    fi
+
+    if [ -z "$target_source_file" ] && [ -z "$target_source_audio_dir" ]; then
+        echo "[Error] Please specify the target file/dir. The target source (can be a json file or a dir). For example, the target_source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the target_source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+ exit 1 + fi + + if [ -z "$target_source_file" ]; then + target_source=$target_source_audio_dir + fi + + if [ -z "$target_source_audio_dir" ]; then + target_source=$target_source_file + fi + + if [ -z "$infer_target_speaker" ]; then + infer_target_speaker="temp1_temp2" + fi + + if [ -z "$resume_type" ]; then + resume_type="finetune" + fi + + if [ -z "$infer_key_shift" ]; then + infer_key_shift="autoshift" + fi + + if [ -z "$infer_vocoder_dir" ]; then + infer_vocoder_dir="$work_dir"/pretrained/bigvgan + fi + + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/qvc/inf_preprocess.py \ + --infsource $infer_source + + + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/qvc/preprocess.py \ + --config $inf_config \ + --num_workers 8 + + CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/qvc/train.py \ + --config "$inf_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --resume \ + --resume_from_ckpt_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/qvc/inference.py \ + --config $inf_config \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $target_source \ + --output_dir $infer_output_dir \ + --log_level debug + + python "${work_dir}"/bins/qvc/post_process.py + +fi +if [ $running_stage -eq 4 ]; then + if [ -z "$exp_name" ]; then + exp_name="test" + fi + echo "Exprimental Name: $exp_name" + + if [ -z "$infer_expt_dir" ]; then + infer_expt_dir="${work_dir}/temp/ckpts/svc/${exp_name}" + fi + + if [ -z "$resume_from_ckpt_path" ]; then + echo "[Error] Please specify the ckpt path." + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="${work_dir}/temp" + fi + + if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then + echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)." + exit 1 + fi + + if [ -z "$infer_source_file" ]; then + infer_source=$infer_source_audio_dir + fi + + if [ -z "$infer_source_audio_dir" ]; then + infer_source=$infer_source_file + fi + + if [ -z "$target_source_file" ] && [ -z "$target_source_audio_dir" ]; then + echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)." 
+ exit 1 + fi + + if [ -z "$target_source_file" ]; then + target_source=$target_source_audio_dir + fi + + if [ -z "$target_source_audio_dir" ]; then + target_source=$target_source_file + fi + + if [ -z "$infer_target_speaker" ]; then + infer_target_speaker="temp1_temp2" + fi + + if [ -z "$resume_type" ]; then + resume_type="finetune" + fi + + if [ -z "$infer_key_shift" ]; then + infer_key_shift="autoshift" + fi + + if [ -z "$infer_vocoder_dir" ]; then + infer_vocoder_dir="$work_dir"/pretrained/bigvgan + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/qvc/webui.py \ + --config $inf_config \ + --num_workers 8 \ + --exp_name "$exp_name" \ + --resume \ + --resume_from_ckpt_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $target_source \ + --log_level info \ + --output_dir $infer_output_dir + +fi \ No newline at end of file