Billpai committed · Commit 4e04d23 · 1 Parent(s): 0312eff
processors/__init__.py ADDED
File without changes
processors/acoustic_extractor.py ADDED
@@ -0,0 +1,903 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+
10
+ import json
11
+ from tqdm import tqdm
12
+ from sklearn.preprocessing import StandardScaler
13
+ from utils.io import save_feature, save_txt
14
+ from utils.util import has_existed
15
+ from utils.tokenizer import extract_encodec_token
16
+ from utils.stft import TacotronSTFT
17
+ from utils.dsp import compress, audio_to_label
18
+ from utils.data_utils import remove_outlier
19
+ from preprocessors.metadata import replace_augment_name
20
+ from scipy.interpolate import interp1d
21
+
22
+ ZERO = 1e-12
23
+
24
+
25
+ def extract_utt_acoustic_features_parallel(metadata, dataset_output, cfg, n_workers=1):
26
+ """Extract acoustic features from utterances (multi-process version; currently iterates over utterances serially)
27
+
28
+ Args:
29
+ metadata (dict): dictionary that stores data in train.json and test.json files
30
+ dataset_output (str): directory to store acoustic features
31
+ cfg (dict): dictionary that stores configurations
32
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
33
+
34
+ Returns:
35
+ None: the extracted features are written to disk under dataset_output
36
+ """
37
+ for utt in tqdm(metadata):
38
+ if cfg.task_type == "tts":
39
+ extract_utt_acoustic_features_tts(dataset_output, cfg, utt)
40
+ if cfg.task_type == "svc":
41
+ extract_utt_acoustic_features_svc(dataset_output, cfg, utt)
42
+ if cfg.task_type == "vocoder":
43
+ extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt)
44
+ if cfg.task_type == "tta":
45
+ extract_utt_acoustic_features_tta(dataset_output, cfg, utt)
46
+
47
+
48
+ def avg_phone_feature(feature, duration, interpolation=False):
49
+ feature = feature[: sum(duration)]
50
+ if interpolation:
51
+ nonzero_ids = np.where(feature != 0)[0]
52
+ interp_fn = interp1d(
53
+ nonzero_ids,
54
+ feature[nonzero_ids],
55
+ fill_value=(feature[nonzero_ids[0]], feature[nonzero_ids[-1]]),
56
+ bounds_error=False,
57
+ )
58
+ feature = interp_fn(np.arange(0, len(feature)))
59
+
60
+ # Phoneme-level average
61
+ pos = 0
62
+ for i, d in enumerate(duration):
63
+ if d > 0:
64
+ feature[i] = np.mean(feature[pos : pos + d])
65
+ else:
66
+ feature[i] = 0
67
+ pos += d
68
+ feature = feature[: len(duration)]
69
+ return feature
70
+
71
+
72
+ def extract_utt_acoustic_features_serial(metadata, dataset_output, cfg):
73
+ """Extract acoustic features from utterances (in single process)
74
+
75
+ Args:
76
+ metadata (dict): dictionary that stores data in train.json and test.json files
77
+ dataset_output (str): directory to store acoustic features
78
+ cfg (dict): dictionary that stores configurations
79
+
80
+ """
81
+ for utt in tqdm(metadata):
82
+ if cfg.task_type == "tts":
83
+ extract_utt_acoustic_features_tts(dataset_output, cfg, utt)
84
+ if cfg.task_type == "svc":
85
+ extract_utt_acoustic_features_svc(dataset_output, cfg, utt)
86
+ if cfg.task_type == "vocoder":
87
+ extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt)
88
+ if cfg.task_type == "tta":
89
+ extract_utt_acoustic_features_tta(dataset_output, cfg, utt)
90
+
91
+
92
+ def __extract_utt_acoustic_features(dataset_output, cfg, utt):
93
+ """Extract acoustic features from utterances (in single process)
94
+
95
+ Args:
96
+ dataset_output (str): directory to store acoustic features
97
+ cfg (dict): dictionary that stores configurations
98
+ utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
99
+ path to utterance, duration, utterance index
100
+
101
+ """
102
+ from utils import audio, f0, world, duration
103
+
104
+ uid = utt["Uid"]
105
+ wav_path = utt["Path"]
106
+ if os.path.exists(os.path.join(dataset_output, cfg.preprocess.raw_data)):
107
+ wav_path = os.path.join(
108
+ dataset_output, cfg.preprocess.raw_data, utt["Singer"], uid + ".wav"
109
+ )
110
+
111
+ with torch.no_grad():
112
+ # Load audio data into tensor with sample rate of the config file
113
+ wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
114
+ wav = wav_torch.cpu().numpy()
115
+
116
+ # extract features
117
+ if cfg.preprocess.extract_duration:
118
+ durations, phones, start, end = duration.get_duration(
119
+ utt, wav, cfg.preprocess
120
+ )
121
+ save_feature(dataset_output, cfg.preprocess.duration_dir, uid, durations)
122
+ save_txt(dataset_output, cfg.preprocess.lab_dir, uid, phones)
123
+ wav = wav[start:end].astype(np.float32)
124
+ wav_torch = torch.from_numpy(wav).to(wav_torch.device)
125
+
126
+ if cfg.preprocess.extract_linear_spec:
127
+ from utils.mel import extract_linear_features
128
+
129
+ linear = extract_linear_features(wav_torch.unsqueeze(0), cfg.preprocess)
130
+ save_feature(
131
+ dataset_output, cfg.preprocess.linear_dir, uid, linear.cpu().numpy()
132
+ )
133
+
134
+ if cfg.preprocess.extract_mel:
135
+ from utils.mel import extract_mel_features
136
+
137
+ if cfg.preprocess.mel_extract_mode == "taco":
138
+ _stft = TacotronSTFT(
139
+ sampling_rate=cfg.preprocess.sample_rate,
140
+ win_length=cfg.preprocess.win_size,
141
+ hop_length=cfg.preprocess.hop_size,
142
+ filter_length=cfg.preprocess.n_fft,
143
+ n_mel_channels=cfg.preprocess.n_mel,
144
+ mel_fmin=cfg.preprocess.fmin,
145
+ mel_fmax=cfg.preprocess.fmax,
146
+ )
147
+ mel = extract_mel_features(
148
+ wav_torch.unsqueeze(0), cfg.preprocess, taco=True, _stft=_stft
149
+ )
150
+ if cfg.preprocess.extract_duration:
151
+ mel = mel[:, : sum(durations)]
152
+ else:
153
+ mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
154
+ save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())
155
+
156
+ if cfg.preprocess.extract_energy:
157
+ if (
158
+ cfg.preprocess.energy_extract_mode == "from_mel"
159
+ and cfg.preprocess.extract_mel
160
+ ):
161
+ energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
162
+ elif cfg.preprocess.energy_extract_mode == "from_waveform":
163
+ energy = audio.energy(wav, cfg.preprocess)
164
+ elif cfg.preprocess.energy_extract_mode == "from_tacotron_stft":
165
+ _stft = TacotronSTFT(
166
+ sampling_rate=cfg.preprocess.sample_rate,
167
+ win_length=cfg.preprocess.win_size,
168
+ hop_length=cfg.preprocess.hop_size,
169
+ filter_length=cfg.preprocess.n_fft,
170
+ n_mel_channels=cfg.preprocess.n_mel,
171
+ mel_fmin=cfg.preprocess.fmin,
172
+ mel_fmax=cfg.preprocess.fmax,
173
+ )
174
+ _, energy = audio.get_energy_from_tacotron(wav, _stft)
175
+ else:
176
+ assert cfg.preprocess.energy_extract_mode in [
177
+ "from_mel",
178
+ "from_waveform",
179
+ "from_tacotron_stft",
180
+ ], f"{cfg.preprocess.energy_extract_mode} not in supported energy_extract_mode [from_mel, from_waveform, from_tacotron_stft]"
181
+ if cfg.preprocess.extract_duration:
182
+ energy = energy[: sum(durations)]
183
+ phone_energy = avg_phone_feature(energy, durations)
184
+ save_feature(
185
+ dataset_output, cfg.preprocess.phone_energy_dir, uid, phone_energy
186
+ )
187
+
188
+ save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)
189
+
190
+ if cfg.preprocess.extract_pitch:
191
+ pitch = f0.get_f0(wav, cfg.preprocess)
192
+ if cfg.preprocess.extract_duration:
193
+ pitch = pitch[: sum(durations)]
194
+ phone_pitch = avg_phone_feature(pitch, durations, interpolation=True)
195
+ save_feature(
196
+ dataset_output, cfg.preprocess.phone_pitch_dir, uid, phone_pitch
197
+ )
198
+ save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)
199
+
200
+ if cfg.preprocess.extract_uv:
201
+ assert isinstance(pitch, np.ndarray)
202
+ uv = pitch != 0
203
+ save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)
204
+
205
+ if cfg.preprocess.extract_audio:
206
+ save_feature(dataset_output, cfg.preprocess.audio_dir, uid, wav)
207
+
208
+ if cfg.preprocess.extract_label:
209
+ if cfg.preprocess.is_mu_law:
210
+ # compress audio
211
+ wav = compress(wav, cfg.preprocess.bits)
212
+ label = audio_to_label(wav, cfg.preprocess.bits)
213
+ save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)
214
+
215
+ if cfg.preprocess.extract_acoustic_token:
216
+ if cfg.preprocess.acoustic_token_extractor == "Encodec":
217
+ codes = extract_encodec_token(wav_path)
218
+ save_feature(
219
+ dataset_output, cfg.preprocess.acoustic_token_dir, uid, codes
220
+ )
221
+
222
+
223
+ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt):
224
+ __extract_utt_acoustic_features(dataset_output, cfg, utt)
225
+
226
+
227
+ def extract_utt_acoustic_features_svc(dataset_output, cfg, utt):
228
+ """Extract acoustic features from utterances (in single process)
229
+
230
+ Args:
231
+ dataset_output (str): directory to store acoustic features
232
+ cfg (dict): dictionary that stores configurations
233
+ utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
234
+ path to utterance, duration, utterance index
235
+
236
+ """
237
+ from utils import audio, f0, world, duration
238
+
239
+ uid = utt["Uid"]
240
+ wav_path = utt["Path"]
241
+
242
+ with torch.no_grad():
243
+ # Load audio data into tensor with sample rate of the config file
244
+ wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
245
+ wav = wav_torch.cpu().numpy()
246
+
247
+ # extract features
248
+ if cfg.preprocess.extract_mel:
249
+ from utils.mel import extract_mel_features
250
+
251
+ mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
252
+ save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())
253
+
254
+ if cfg.preprocess.extract_energy:
255
+ energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
256
+ save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)
257
+
258
+ if cfg.preprocess.extract_pitch:
259
+ pitch = f0.get_f0(wav, cfg.preprocess)
260
+ save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)
261
+
262
+ if cfg.preprocess.extract_uv:
263
+ assert isinstance(pitch, np.ndarray)
264
+ uv = pitch != 0
265
+ save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)
266
+
267
+
268
+ def extract_utt_acoustic_features_tta(dataset_output, cfg, utt):
269
+ __extract_utt_acoustic_features(dataset_output, cfg, utt)
270
+
271
+
272
+ def extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt):
273
+ """Extract acoustic features from utterances (in single process)
274
+
275
+ Args:
276
+ dataset_output (str): directory to store acoustic features
277
+ cfg (dict): dictionary that stores configurations
278
+ utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
279
+ path to utterance, duration, utterance index
280
+
281
+ """
282
+ from utils import audio, f0, world, duration
283
+
284
+ uid = utt["Uid"]
285
+ wav_path = utt["Path"]
286
+
287
+ with torch.no_grad():
288
+ # Load audio data into tensor with sample rate of the config file
289
+ wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
290
+ wav = wav_torch.cpu().numpy()
291
+
292
+ # extract features
293
+ if cfg.preprocess.extract_mel:
294
+ from utils.mel import extract_mel_features
295
+
296
+ mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
297
+ save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())
298
+
299
+ if cfg.preprocess.extract_energy:
300
+ if (
301
+ cfg.preprocess.energy_extract_mode == "from_mel"
302
+ and cfg.preprocess.extract_mel
303
+ ):
304
+ energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
305
+ elif cfg.preprocess.energy_extract_mode == "from_waveform":
306
+ energy = audio.energy(wav, cfg.preprocess)
307
+ else:
308
+ assert cfg.preprocess.energy_extract_mode in [
309
+ "from_mel",
310
+ "from_waveform",
311
+ ], f"{cfg.preprocess.energy_extract_mode} not in supported energy_extract_mode [from_mel, from_waveform, from_tacotron_stft]"
312
+
313
+ save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)
314
+
315
+ if cfg.preprocess.extract_pitch:
316
+ pitch = f0.get_f0(wav, cfg.preprocess)
317
+ save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)
318
+
319
+ if cfg.preprocess.extract_uv:
320
+ assert isinstance(pitch, np.ndarray)
321
+ uv = pitch != 0
322
+ save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)
323
+
324
+ if cfg.preprocess.extract_audio:
325
+ save_feature(dataset_output, cfg.preprocess.audio_dir, uid, wav)
326
+
327
+ if cfg.preprocess.extract_label:
328
+ if cfg.preprocess.is_mu_law:
329
+ # compress audio
330
+ wav = compress(wav, cfg.preprocess.bits)
331
+ label = audio_to_label(wav, cfg.preprocess.bits)
332
+ save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)
333
+
334
+
335
+ def cal_normalized_mel(mel, dataset_name, cfg):
336
+ mel_min, mel_max = load_mel_extrema(cfg, dataset_name)
337
+ mel_norm = normalize_mel_channel(mel, mel_min, mel_max)
338
+ return mel_norm
339
+
340
+
341
+ def cal_mel_min_max(dataset, output_path, cfg, metadata=None):
342
+ dataset_output = os.path.join(output_path, dataset)
343
+
344
+ if metadata is None:
345
+ metadata = []
346
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
347
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
348
+ with open(dataset_file, "r") as f:
349
+ metadata.extend(json.load(f))
350
+
351
+ tmp_mel_min = []
352
+ tmp_mel_max = []
353
+ for item in metadata:
354
+ mel_path = os.path.join(
355
+ dataset_output, cfg.preprocess.mel_dir, item["Uid"] + ".npy"
356
+ )
357
+ if not os.path.exists(mel_path):
358
+ continue
359
+ mel = np.load(mel_path)
360
+ if mel.shape[0] != cfg.preprocess.n_mel:
361
+ mel = mel.T
362
+ # mel: (n_mels, T)
363
+ assert mel.shape[0] == cfg.preprocess.n_mel
364
+
365
+ tmp_mel_min.append(np.min(mel, axis=-1))
366
+ tmp_mel_max.append(np.max(mel, axis=-1))
367
+
368
+ mel_min = np.min(tmp_mel_min, axis=0)
369
+ mel_max = np.max(tmp_mel_max, axis=0)
370
+
371
+ ## save mel min max data
372
+ mel_min_max_dir = os.path.join(dataset_output, cfg.preprocess.mel_min_max_stats_dir)
373
+ os.makedirs(mel_min_max_dir, exist_ok=True)
374
+
375
+ mel_min_path = os.path.join(mel_min_max_dir, "mel_min.npy")
376
+ mel_max_path = os.path.join(mel_min_max_dir, "mel_max.npy")
377
+ np.save(mel_min_path, mel_min)
378
+ np.save(mel_max_path, mel_max)
379
+
380
+
381
+ def denorm_for_pred_mels(cfg, dataset_name, split, pred):
382
+ """
383
+ Args:
384
+ pred: a list whose every element is (frame_len, n_mels)
385
+ Return:
386
+ similar like pred
387
+ """
388
+ mel_min, mel_max = load_mel_extrema(cfg.preprocess, dataset_name)
389
+ recovered_mels = [
390
+ denormalize_mel_channel(mel.T, mel_min, mel_max).T for mel in pred
391
+ ]
392
+
393
+ return recovered_mels
394
+
395
+
396
+ def load_mel_extrema(cfg, dataset_name):
397
+ data_dir = os.path.join(cfg.processed_dir, dataset_name, cfg.mel_min_max_stats_dir)
398
+
399
+ min_file = os.path.join(data_dir, "mel_min.npy")
400
+ max_file = os.path.join(data_dir, "mel_max.npy")
401
+
402
+ mel_min = np.load(min_file)
403
+ mel_max = np.load(max_file)
404
+
405
+ return mel_min, mel_max
406
+
407
+
408
+ def denormalize_mel_channel(mel, mel_min, mel_max):
409
+ mel_min = np.expand_dims(mel_min, -1)
410
+ mel_max = np.expand_dims(mel_max, -1)
411
+ return (mel + 1) / 2 * (mel_max - mel_min + ZERO) + mel_min
412
+
413
+
414
+ def normalize_mel_channel(mel, mel_min, mel_max):
415
+ mel_min = np.expand_dims(mel_min, -1)
416
+ mel_max = np.expand_dims(mel_max, -1)
417
+ return (mel - mel_min) / (mel_max - mel_min + ZERO) * 2 - 1
418
+
419
+
420
+ def normalize(dataset, feat_dir, cfg):
421
+ dataset_output = os.path.join(cfg.preprocess.processed_dir, dataset)
422
+ print(f"normalize {feat_dir}")
423
+
424
+ max_value = np.finfo(np.float64).min
425
+ min_value = np.finfo(np.float64).max
426
+
427
+ scaler = StandardScaler()
428
+ feat_files = os.listdir(os.path.join(dataset_output, feat_dir))
429
+
430
+ for feat_file in tqdm(feat_files):
431
+ feat_file = os.path.join(dataset_output, feat_dir, feat_file)
432
+ if not feat_file.endswith(".npy"):
433
+ continue
434
+ feat = np.load(feat_file)
435
+ max_value = max(max_value, max(feat))
436
+ min_value = min(min_value, min(feat))
437
+ scaler.partial_fit(feat.reshape((-1, 1)))
438
+ mean = scaler.mean_[0]
439
+ std = scaler.scale_[0]
440
+ stat = np.array([min_value, max_value, mean, std])
441
+ stat_npy = os.path.join(dataset_output, f"{feat_dir}_stat.npy")
442
+ np.save(stat_npy, stat)
443
+ return mean, std, min_value, max_value
444
+
445
+
446
+ def load_normalized(feat_dir, dataset_name, cfg):
447
+ dataset_output = os.path.join(cfg.preprocess.processed_dir, dataset_name)
448
+ stat_npy = os.path.join(dataset_output, f"{feat_dir}_stat.npy")
449
+ min_value, max_value, mean, std = np.load(stat_npy)
450
+ return mean, std, min_value, max_value
451
+
452
+
453
+ def cal_pitch_statistics_svc(dataset, output_path, cfg, metadata=None):
454
+ # path of dataset
455
+ dataset_dir = os.path.join(output_path, dataset)
456
+ save_dir = os.path.join(dataset_dir, cfg.preprocess.pitch_dir)
457
+ os.makedirs(save_dir, exist_ok=True)
458
+ if has_existed(os.path.join(save_dir, "statistics.json")):
459
+ return
460
+
461
+ if metadata is None:
462
+ # load singers and ids
463
+ singers = json.load(open(os.path.join(dataset_dir, "singers.json"), "r"))
464
+
465
+ # combine train and test metadata
466
+ metadata = []
467
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
468
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
469
+ with open(dataset_file, "r") as f:
470
+ metadata.extend(json.load(f))
471
+ else:
472
+ singers = list(set([item["Singer"] for item in metadata]))
473
+ singers = {
474
+ "{}_{}".format(dataset, name): idx for idx, name in enumerate(singers)
475
+ }
476
+
477
+ # use different scalers for each singer
478
+ pitch_scalers = [[] for _ in range(len(singers))]
479
+ total_pitch_scalers = [[] for _ in range(len(singers))]
480
+
481
+ for utt_info in tqdm(metadata, desc="Loading F0..."):
482
+ # utt = f'{utt_info["Dataset"]}_{utt_info["Uid"]}'
483
+ singer = utt_info["Singer"]
484
+ pitch_path = os.path.join(
485
+ dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
486
+ )
487
+ # total_pitch contains all pitch including unvoiced frames
488
+ if not os.path.exists(pitch_path):
489
+ continue
490
+ total_pitch = np.load(pitch_path)
491
+ assert len(total_pitch) > 0
492
+ # pitch contains only voiced frames
493
+ pitch = total_pitch[total_pitch != 0]
494
+ spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]
495
+
496
+ # update pitch scalers
497
+ pitch_scalers[spkid].extend(pitch.tolist())
498
+ # update total pitch scalers
499
+ total_pitch_scalers[spkid].extend(total_pitch.tolist())
500
+
501
+ # save pitch statistics for each singer in dict
502
+ sta_dict = {}
503
+ for singer in tqdm(singers, desc="Singers statistics"):
504
+ spkid = singers[singer]
505
+ # voiced pitch statistics
506
+ mean, std, min, max, median = (
507
+ np.mean(pitch_scalers[spkid]),
508
+ np.std(pitch_scalers[spkid]),
509
+ np.min(pitch_scalers[spkid]),
510
+ np.max(pitch_scalers[spkid]),
511
+ np.median(pitch_scalers[spkid]),
512
+ )
513
+
514
+ # total pitch statistics
515
+ mean_t, std_t, min_t, max_t, median_t = (
516
+ np.mean(total_pitch_scalers[spkid]),
517
+ np.std(total_pitch_scalers[spkid]),
518
+ np.min(total_pitch_scalers[spkid]),
519
+ np.max(total_pitch_scalers[spkid]),
520
+ np.median(total_pitch_scalers[spkid]),
521
+ )
522
+ sta_dict[singer] = {
523
+ "voiced_positions": {
524
+ "mean": mean,
525
+ "std": std,
526
+ "median": median,
527
+ "min": min,
528
+ "max": max,
529
+ },
530
+ "total_positions": {
531
+ "mean": mean_t,
532
+ "std": std_t,
533
+ "median": median_t,
534
+ "min": min_t,
535
+ "max": max_t,
536
+ },
537
+ }
538
+
539
+ # save statistics
540
+ with open(os.path.join(save_dir, "statistics.json"), "w") as f:
541
+ json.dump(sta_dict, f, indent=4, ensure_ascii=False)
542
+
543
+
544
+ def cal_pitch_statistics(dataset, output_path, cfg):
545
+ # path of dataset
546
+ dataset_dir = os.path.join(output_path, dataset)
547
+ if cfg.preprocess.use_phone_pitch:
548
+ pitch_dir = cfg.preprocess.phone_pitch_dir
549
+ else:
550
+ pitch_dir = cfg.preprocess.pitch_dir
551
+ save_dir = os.path.join(dataset_dir, pitch_dir)
552
+
553
+ os.makedirs(save_dir, exist_ok=True)
554
+ if has_existed(os.path.join(save_dir, "statistics.json")):
555
+ return
556
+ # load singers and ids
557
+ singers = json.load(open(os.path.join(dataset_dir, "singers.json"), "r"))
558
+
559
+ # combine train and test metadata
560
+ metadata = []
561
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
562
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
563
+ with open(dataset_file, "r") as f:
564
+ metadata.extend(json.load(f))
565
+
566
+ # use different scalers for each singer
567
+ pitch_scalers = [[] for _ in range(len(singers))]
568
+ total_pitch_scalers = [[] for _ in range(len(singers))]
569
+
570
+ for utt_info in metadata:
571
+ utt = f'{utt_info["Dataset"]}_{utt_info["Uid"]}'
572
+ singer = utt_info["Singer"]
573
+ pitch_path = os.path.join(dataset_dir, pitch_dir, utt_info["Uid"] + ".npy")
574
+ # total_pitch contains all pitch including unvoiced frames
575
+ if not os.path.exists(pitch_path):
576
+ continue
577
+ total_pitch = np.load(pitch_path)
578
+ assert len(total_pitch) > 0
579
+ # pitch contains only voiced frames
580
+ pitch = total_pitch[total_pitch != 0]
581
+ if cfg.preprocess.pitch_remove_outlier:
582
+ pitch = remove_outlier(total_pitch)
583
+ spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]
584
+
585
+ # update pitch scalers
586
+ pitch_scalers[spkid].extend(pitch.tolist())
587
+ # update total pitch scalers
588
+ total_pitch_scalers[spkid].extend(total_pitch.tolist())
589
+
590
+ # save pitch statistics for each singer in dict
591
+ sta_dict = {}
592
+ for singer in singers:
593
+ spkid = singers[singer]
594
+ # voiced pitch statistics
595
+ mean, std, min, max, median = (
596
+ np.mean(pitch_scalers[spkid]),
597
+ np.std(pitch_scalers[spkid]),
598
+ np.min(pitch_scalers[spkid]),
599
+ np.max(pitch_scalers[spkid]),
600
+ np.median(pitch_scalers[spkid]),
601
+ )
602
+
603
+ # total pitch statistics
604
+ mean_t, std_t, min_t, max_t, median_t = (
605
+ np.mean(total_pitch_scalers[spkid]),
606
+ np.std(total_pitch_scalers[spkid]),
607
+ np.min(total_pitch_scalers[spkid]),
608
+ np.max(total_pitch_scalers[spkid]),
609
+ np.median(total_pitch_scalers[spkid]),
610
+ )
611
+ sta_dict[singer] = {
612
+ "voiced_positions": {
613
+ "mean": mean,
614
+ "std": std,
615
+ "median": median,
616
+ "min": min,
617
+ "max": max,
618
+ },
619
+ "total_positions": {
620
+ "mean": mean_t,
621
+ "std": std_t,
622
+ "median": median_t,
623
+ "min": min_t,
624
+ "max": max_t,
625
+ },
626
+ }
627
+
628
+ # save statistics
629
+ with open(os.path.join(save_dir, "statistics.json"), "w") as f:
630
+ json.dump(sta_dict, f, indent=4, ensure_ascii=False)
631
+
632
+
633
+ def cal_energy_statistics(dataset, output_path, cfg):
634
+ # path of dataset
635
+ dataset_dir = os.path.join(output_path, dataset)
636
+ if cfg.preprocess.use_phone_energy:
637
+ energy_dir = cfg.preprocess.phone_energy_dir
638
+ else:
639
+ energy_dir = cfg.preprocess.energy_dir
640
+ save_dir = os.path.join(dataset_dir, energy_dir)
641
+ os.makedirs(save_dir, exist_ok=True)
642
+ print(os.path.join(save_dir, "statistics.json"))
643
+ if has_existed(os.path.join(save_dir, "statistics.json")):
644
+ return
645
+ # load singers and ids
646
+ singers = json.load(open(os.path.join(dataset_dir, "singers.json"), "r"))
647
+
648
+ # combine train and test metadata
649
+ metadata = []
650
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
651
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
652
+ with open(dataset_file, "r") as f:
653
+ metadata.extend(json.load(f))
654
+
655
+ # use different scalers for each singer
656
+ energy_scalers = [[] for _ in range(len(singers))]
657
+ total_energy_scalers = [[] for _ in range(len(singers))]
658
+
659
+ for utt_info in metadata:
660
+ utt = f'{utt_info["Dataset"]}_{utt_info["Uid"]}'
661
+ singer = utt_info["Singer"]
662
+ energy_path = os.path.join(dataset_dir, energy_dir, utt_info["Uid"] + ".npy")
663
+ # total_energy contains all energy including unvoiced frames
664
+ if not os.path.exists(energy_path):
665
+ continue
666
+ total_energy = np.load(energy_path)
667
+ assert len(total_energy) > 0
668
+ # energy contains only voiced frames
669
+ energy = total_energy[total_energy != 0]
670
+ if cfg.preprocess.energy_remove_outlier:
671
+ energy = remove_outlier(total_energy)
672
+ spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]
673
+
674
+ # update energy scalers
675
+ energy_scalers[spkid].extend(energy.tolist())
676
+ # update total energy scalers
677
+ total_energy_scalers[spkid].extend(total_energy.tolist())
678
+
679
+ # save energy statistics for each singer in dict
680
+ sta_dict = {}
681
+ for singer in singers:
682
+ spkid = singers[singer]
683
+ # voiced energy statistics
684
+ mean, std, min, max, median = (
685
+ np.mean(energy_scalers[spkid]),
686
+ np.std(energy_scalers[spkid]),
687
+ np.min(energy_scalers[spkid]),
688
+ np.max(energy_scalers[spkid]),
689
+ np.median(energy_scalers[spkid]),
690
+ )
691
+
692
+ # total energy statistics
693
+ mean_t, std_t, min_t, max_t, median_t = (
694
+ np.mean(total_energy_scalers[spkid]),
695
+ np.std(total_energy_scalers[spkid]),
696
+ np.min(total_energy_scalers[spkid]),
697
+ np.max(total_energy_scalers[spkid]),
698
+ np.median(total_energy_scalers[spkid]),
699
+ )
700
+ sta_dict[singer] = {
701
+ "voiced_positions": {
702
+ "mean": mean,
703
+ "std": std,
704
+ "median": median,
705
+ "min": min,
706
+ "max": max,
707
+ },
708
+ "total_positions": {
709
+ "mean": mean_t,
710
+ "std": std_t,
711
+ "median": median_t,
712
+ "min": min_t,
713
+ "max": max_t,
714
+ },
715
+ }
716
+
717
+ # save statistics
718
+ with open(os.path.join(save_dir, "statistics.json"), "w") as f:
719
+ json.dump(sta_dict, f, indent=4, ensure_ascii=False)
720
+
721
+
722
+ def copy_acoustic_features(metadata, dataset_dir, src_dataset_dir, cfg):
723
+ """Copy acoustic features from src_dataset_dir to dataset_dir
724
+
725
+ Args:
726
+ metadata (dict): dictionary that stores data in train.json and test.json files
727
+ dataset_dir (str): destination directory where soft links to the features are created
728
+ src_dataset_dir (str): source directory that already contains the extracted features
729
+ cfg (dict): dictionary that stores configurations
730
+
731
+ """
732
+
733
+ if cfg.preprocess.extract_mel:
734
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.mel_dir)):
735
+ os.makedirs(
736
+ os.path.join(dataset_dir, cfg.preprocess.mel_dir), exist_ok=True
737
+ )
738
+ print(
739
+ "Copying mel features from {} to {}...".format(
740
+ src_dataset_dir, dataset_dir
741
+ )
742
+ )
743
+ for utt_info in tqdm(metadata):
744
+ src_mel_path = os.path.join(
745
+ src_dataset_dir, cfg.preprocess.mel_dir, utt_info["Uid"] + ".npy"
746
+ )
747
+ dst_mel_path = os.path.join(
748
+ dataset_dir, cfg.preprocess.mel_dir, utt_info["Uid"] + ".npy"
749
+ )
750
+ # create soft-links
751
+ if not os.path.exists(dst_mel_path):
752
+ os.symlink(src_mel_path, dst_mel_path)
753
+ if cfg.preprocess.extract_energy:
754
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.energy_dir)):
755
+ os.makedirs(
756
+ os.path.join(dataset_dir, cfg.preprocess.energy_dir), exist_ok=True
757
+ )
758
+ print(
759
+ "Copying energy features from {} to {}...".format(
760
+ src_dataset_dir, dataset_dir
761
+ )
762
+ )
763
+ for utt_info in tqdm(metadata):
764
+ src_energy_path = os.path.join(
765
+ src_dataset_dir, cfg.preprocess.energy_dir, utt_info["Uid"] + ".npy"
766
+ )
767
+ dst_energy_path = os.path.join(
768
+ dataset_dir, cfg.preprocess.energy_dir, utt_info["Uid"] + ".npy"
769
+ )
770
+ # create soft-links
771
+ if not os.path.exists(dst_energy_path):
772
+ os.symlink(src_energy_path, dst_energy_path)
773
+ if cfg.preprocess.extract_pitch:
774
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.pitch_dir)):
775
+ os.makedirs(
776
+ os.path.join(dataset_dir, cfg.preprocess.pitch_dir), exist_ok=True
777
+ )
778
+ print(
779
+ "Copying pitch features from {} to {}...".format(
780
+ src_dataset_dir, dataset_dir
781
+ )
782
+ )
783
+ for utt_info in tqdm(metadata):
784
+ src_pitch_path = os.path.join(
785
+ src_dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
786
+ )
787
+ dst_pitch_path = os.path.join(
788
+ dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
789
+ )
790
+ # create soft-links
791
+ if not os.path.exists(dst_pitch_path):
792
+ os.symlink(src_pitch_path, dst_pitch_path)
793
+ if cfg.preprocess.extract_uv:
794
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.uv_dir)):
795
+ os.makedirs(
796
+ os.path.join(dataset_dir, cfg.preprocess.uv_dir), exist_ok=True
797
+ )
798
+ print(
799
+ "Copying uv features from {} to {}...".format(
800
+ src_dataset_dir, dataset_dir
801
+ )
802
+ )
803
+ for utt_info in tqdm(metadata):
804
+ src_uv_path = os.path.join(
805
+ src_dataset_dir, cfg.preprocess.uv_dir, utt_info["Uid"] + ".npy"
806
+ )
807
+ dst_uv_path = os.path.join(
808
+ dataset_dir, cfg.preprocess.uv_dir, utt_info["Uid"] + ".npy"
809
+ )
810
+ # create soft-links
811
+ if not os.path.exists(dst_uv_path):
812
+ os.symlink(src_uv_path, dst_uv_path)
813
+ if cfg.preprocess.extract_audio:
814
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.audio_dir)):
815
+ os.makedirs(
816
+ os.path.join(dataset_dir, cfg.preprocess.audio_dir), exist_ok=True
817
+ )
818
+ print(
819
+ "Copying audio features from {} to {}...".format(
820
+ src_dataset_dir, dataset_dir
821
+ )
822
+ )
823
+ for utt_info in tqdm(metadata):
824
+ src_audio_path = os.path.join(
825
+ src_dataset_dir, cfg.preprocess.audio_dir, utt_info["Uid"] + ".npy"
826
+ )
827
+ dst_audio_path = os.path.join(
828
+ dataset_dir, cfg.preprocess.audio_dir, utt_info["Uid"] + ".npy"
829
+ )
830
+ # create soft-links
831
+ if not os.path.exists(dst_audio_path):
832
+ os.symlink(src_audio_path, dst_audio_path)
833
+ if cfg.preprocess.extract_label:
834
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.label_dir)):
835
+ os.makedirs(
836
+ os.path.join(dataset_dir, cfg.preprocess.label_dir), exist_ok=True
837
+ )
838
+ print(
839
+ "Copying label features from {} to {}...".format(
840
+ src_dataset_dir, dataset_dir
841
+ )
842
+ )
843
+ for utt_info in tqdm(metadata):
844
+ src_label_path = os.path.join(
845
+ src_dataset_dir, cfg.preprocess.label_dir, utt_info["Uid"] + ".npy"
846
+ )
847
+ dst_label_path = os.path.join(
848
+ dataset_dir, cfg.preprocess.label_dir, utt_info["Uid"] + ".npy"
849
+ )
850
+ # create soft-links
851
+ if not os.path.exists(dst_label_path):
852
+ os.symlink(src_label_path, dst_label_path)
853
+
854
+
855
+ def align_duration_mel(dataset, output_path, cfg):
856
+ print("align the duration and mel")
857
+
858
+ dataset_dir = os.path.join(output_path, dataset)
859
+ metadata = []
860
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
861
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
862
+ with open(dataset_file, "r") as f:
863
+ metadata.extend(json.load(f))
864
+
865
+ utt2dur = {}
866
+ for index in tqdm(range(len(metadata))):
867
+ utt_info = metadata[index]
868
+ dataset = utt_info["Dataset"]
869
+ uid = utt_info["Uid"]
870
+ utt = "{}_{}".format(dataset, uid)
871
+
872
+ mel_path = os.path.join(dataset_dir, cfg.preprocess.mel_dir, uid + ".npy")
873
+ mel = np.load(mel_path).transpose(1, 0)
874
+ duration_path = os.path.join(
875
+ dataset_dir, cfg.preprocess.duration_dir, uid + ".npy"
876
+ )
877
+ duration = np.load(duration_path)
878
+ if sum(duration) != mel.shape[0]:
879
+ duration_sum = sum(duration)
880
+ mel_len = mel.shape[0]
881
+ mismatch = abs(duration_sum - mel_len)
882
+ assert mismatch <= 5, "duration and mel length mismatch!"
883
+ cloned = np.array(duration, copy=True)
884
+ if duration_sum > mel_len:
885
+ for j in range(1, len(duration) - 1):
886
+ if mismatch == 0:
887
+ break
888
+ dur_val = cloned[-j]
889
+ if dur_val >= mismatch:
890
+ cloned[-j] -= mismatch
891
+ mismatch -= dur_val
892
+ break
893
+ else:
894
+ cloned[-j] = 0
895
+ mismatch -= dur_val
896
+
897
+ elif duration_sum < mel_len:
898
+ cloned[-1] += mismatch
899
+ duration = cloned
900
+ utt2dur[utt] = duration
901
+ np.save(duration_path, duration)
902
+
903
+ return utt2dur
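A quick usage sketch (not part of the commit) of the mel min/max normalization helpers defined in this file. It assumes the Amphion repository root and its dependencies are importable; the random mel and the shapes are purely illustrative.

import numpy as np
from processors.acoustic_extractor import normalize_mel_channel, denormalize_mel_channel

# fake (n_mels, T) log-mel and its per-channel extrema, as computed in cal_mel_min_max
mel = np.random.uniform(-6.0, 1.0, size=(80, 200))
mel_min = np.min(mel, axis=-1)
mel_max = np.max(mel, axis=-1)

# map each channel into [-1, 1], then invert the mapping
mel_norm = normalize_mel_channel(mel, mel_min, mel_max)
mel_rec = denormalize_mel_channel(mel_norm, mel_min, mel_max)

assert mel_norm.min() >= -1.0 and mel_norm.max() <= 1.0
assert np.allclose(mel, mel_rec, atol=1e-6)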
processors/content_extractor.py ADDED
@@ -0,0 +1,540 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+ import yaml
10
+ import copy
11
+ from tqdm import tqdm
12
+ from torchaudio.compliance import kaldi
13
+ from torch.nn.utils.rnn import pad_sequence
14
+ from torch.utils.data import DataLoader
15
+ from fairseq import checkpoint_utils
16
+ from transformers import AutoModel, Wav2Vec2FeatureExtractor
17
+
18
+ from utils.io_optim import (
19
+ TorchaudioDataset,
20
+ LibrosaDataset,
21
+ FFmpegDataset,
22
+ collate_batch,
23
+ )
24
+ from modules import whisper_extractor as whisper
25
+ from modules.wenet_extractor.utils.init_model import init_model
26
+ from modules.wenet_extractor.utils.checkpoint import load_checkpoint
27
+
28
+ """
29
+ Extractor for content features
30
+ 1. whisper
31
+ 2. contentvec
32
+ 3. wenet
33
+ 4. mert
34
+
35
+ Pipeline:
36
+ in preprocess.py:
37
+ call extract_utt_content_features() to extract content features for each utterance
38
+ extract_utt_content_features() envelopes the following steps:
39
+ 1. load the model (whisper, contentvec, wenet)
40
+ 2. extract the content features
41
+ 3. save the content features into files
42
+ in svc_dataset.py:
43
+ call offline_align() to align the content features to the given target length
44
+
45
+ """
46
+
47
+ """
48
+ Extractor Usage:
49
+ 1. initialize an instance of extractor
50
+ extractor = WhisperExtractor(cfg)
51
+ 2. load the specified model
52
+ extractor.load_model()
53
+ 3. extract the content features
54
+ extractor.extract_content(utt) for single utterance
55
+ extractor.extract_content_batch(utts) for batch utterances
56
+ 4. save the content features
57
+ extractor.save_feature(utt, content_feature) for single utterance
58
+ """
59
+
60
+
61
+ class BaseExtractor:
62
+ def __init__(self, cfg):
63
+ self.cfg = cfg
64
+ self.extractor_type = None
65
+ self.model = None
66
+
67
+ def offline_align(self, content, target_len):
68
+ """
69
+ args:
70
+ content: (source_len, dim)
71
+ target_len: target length
72
+ return:
73
+ mapped_feature: (target_len, dim)
74
+ """
75
+ target_hop = self.cfg.preprocess.hop_size
76
+
77
+ assert self.extractor_type in ["whisper", "contentvec", "wenet"]
78
+ if self.extractor_type == "whisper":
79
+ source_hop = (
80
+ self.cfg.preprocess.whisper_frameshift
81
+ * self.cfg.preprocess.whisper_downsample_rate
82
+ * self.cfg.preprocess.sample_rate
83
+ )
84
+ elif self.extractor_type == "contentvec":
85
+ source_hop = (
86
+ self.cfg.preprocess.contentvec_frameshift
87
+ * self.cfg.preprocess.sample_rate
88
+ )
89
+ elif self.extractor_type == "wenet":
90
+ source_hop = (
91
+ self.cfg.preprocess.wenet_frameshift
92
+ * self.cfg.preprocess.wenet_downsample_rate
93
+ * self.cfg.preprocess.sample_rate
94
+ )
95
+ source_hop = int(source_hop)
96
+ factor = np.gcd(source_hop, target_hop)
97
+ source_hop //= factor
98
+ target_hop //= factor
99
+
100
+ # (source_len, 256)
101
+ _, width = content.shape
102
+ # slice the content from padded feature
103
+ source_len = min(target_len * target_hop // source_hop + 1, len(content))
104
+
105
+ # const ~= target_len * target_hop
106
+ const = source_len * source_hop // target_hop * target_hop
107
+
108
+ # (source_len * source_hop, dim)
109
+ up_sampling_feats = np.repeat(content, source_hop, axis=0)
110
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
111
+ down_sampling_feats = np.average(
112
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
113
+ )
114
+
115
+ err = abs(target_len - len(down_sampling_feats))
116
+ if err > 8:
117
+ # record the largest alignment error seen so far (log file lives under processed_dir)
118
+ err_log_dir = os.path.join(
119
+ self.cfg.preprocess.processed_dir, "align_max_err.log"
120
+ )
121
+ try:
122
+ with open(err_log_dir, "r") as f:
123
+ err_num = int(f.read())
124
+ except (FileNotFoundError, ValueError):
125
+ with open(err_log_dir, "w") as f:
126
+ f.write("0")
127
+ err_num = 0
128
+ if err > err_num:
129
+ with open(err_log_dir, "w") as f:
130
+ f.write(str(err))
131
+
132
+ if len(down_sampling_feats) < target_len:
133
+ # (1, dim) -> (err, dim)
134
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
135
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
136
+
137
+ # (target_len, dim)
138
+ mapped_feature = down_sampling_feats[:target_len]
139
+
140
+ return mapped_feature
141
+
142
+ def save_feature(self, utt, content_feature):
143
+ """Save a single utterance to path {cfg.preprocess.processed_dir}
144
+
145
+ Args:
146
+ utt (dict): one item in metadata, containing information for one utterance
147
+ content_feature (tensor): content feature of one utterance
148
+ """
149
+ uid = utt["Uid"]
150
+ assert self.extractor_type is not None
151
+ out_dir = os.path.join(
152
+ self.cfg.preprocess.processed_dir, utt["Dataset"], self.extractor_type
153
+ )
154
+ os.makedirs(out_dir, exist_ok=True)
155
+ save_path = os.path.join(out_dir, uid + ".npy")
156
+ # only keep effective parts
157
+ duration = utt["Duration"]
158
+ if self.extractor_type == "whisper":
159
+ frameshift = (
160
+ self.cfg.preprocess.whisper_frameshift
161
+ * self.cfg.preprocess.whisper_downsample_rate
162
+ ) # 20ms
163
+ elif self.extractor_type == "contentvec":
164
+ frameshift = self.cfg.preprocess.contentvec_frameshift # 20ms
165
+ elif self.extractor_type == "wenet":
166
+ frameshift = (
167
+ self.cfg.preprocess.wenet_frameshift
168
+ * self.cfg.preprocess.wenet_downsample_rate
169
+ ) # 40ms
170
+ elif self.extractor_type == "mert":
171
+ frameshift = self.cfg.preprocess.mert_frameshift
172
+ else:
173
+ raise NotImplementedError
174
+ # calculate the number of valid frames
175
+ num_frames = int(np.ceil((duration - frameshift) / frameshift)) + 1
176
+ # (num_frames, dim) -> (valid_frames, dim)
177
+ assert (
178
+ len(content_feature.shape) == 2
179
+ ), "content feature shape error, it should be (num_frames, dim)"
180
+ content_feature = content_feature[:num_frames, :]
181
+ np.save(save_path, content_feature.cpu().detach().numpy())
182
+
183
+
184
+ class WhisperExtractor(BaseExtractor):
185
+ def __init__(self, config):
186
+ super(WhisperExtractor, self).__init__(config)
187
+ self.extractor_type = "whisper"
188
+
189
+ def load_model(self):
190
+ # load whisper checkpoint
191
+ print("Loading Whisper Model...")
192
+
193
+ checkpoint_file = (
194
+ self.cfg.preprocess.whisper_model_path
195
+ if "whisper_model_path" in self.cfg.preprocess
196
+ else None
197
+ )
198
+ model = whisper.load_model(
199
+ self.cfg.preprocess.whisper_model, checkpoint_file=checkpoint_file
200
+ )
201
+ if torch.cuda.is_available():
202
+ print("Using GPU...\n")
203
+ model = model.cuda()
204
+ else:
205
+ print("Using CPU...\n")
206
+
207
+ self.model = model.eval()
208
+
209
+ def extract_content_features(self, wavs, lens):
210
+ """extract content features from a batch of dataloader
211
+ Args:
212
+ wavs: tensor (batch_size, T)
213
+ lens: list
214
+ """
215
+ # wavs: (batch, max_len)
216
+ wavs = whisper.pad_or_trim(wavs)
217
+ # batch_mel: (batch, 80, 3000)
218
+ batch_mel = whisper.log_mel_spectrogram(wavs).to(self.model.device)
219
+ with torch.no_grad():
220
+ # (batch, 1500, 1024)
221
+ features = self.model.embed_audio(batch_mel)
222
+ return features
223
+
224
+
225
+ class ContentvecExtractor(BaseExtractor):
226
+ def __init__(self, cfg):
227
+ super(ContentvecExtractor, self).__init__(cfg)
228
+ self.extractor_type = "contentvec"
229
+
230
+ def load_model(self):
231
+ assert self.model is None
232
+ # Load model
233
+ ckpt_path = self.cfg.preprocess.contentvec_file
234
+ print("Load Contentvec Model...")
235
+
236
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
237
+ [ckpt_path],
238
+ suffix="",
239
+ )
240
+ model = models[0]
241
+ model.eval()
242
+
243
+ if torch.cuda.is_available():
244
+ # print("Using GPU...\n")
245
+ model = model.cuda()
246
+
247
+ self.model = model
248
+
249
+ def extract_content_features(self, wavs, lens):
250
+ """extract content features from a batch of dataloader
251
+ Args:
252
+ wavs: tensor (batch, T)
253
+ lens: list
254
+ """
255
+ device = next(self.model.parameters()).device
256
+ wavs = wavs.to(device) # (batch, max_len)
257
+ padding_mask = torch.eq(wavs, torch.zeros_like(wavs)).to(device)
258
+ with torch.no_grad():
259
+ logits = self.model.extract_features(
260
+ source=wavs, padding_mask=padding_mask, output_layer=12
261
+ )
262
+ # feats: (batch, T, 256)
263
+ feats = self.model.final_proj(logits[0])
264
+ return feats
265
+
266
+
267
+ class WenetExtractor(BaseExtractor):
268
+ def __init__(self, config):
269
+ super(WenetExtractor, self).__init__(config)
270
+ self.extractor_type = "wenet"
271
+
272
+ def load_model(self):
273
+ wenet_cfg = self.cfg.preprocess.wenet_config
274
+ wenet_model_path = self.cfg.preprocess.wenet_model_path
275
+ # load Wenet config
276
+ with open(wenet_cfg, "r") as w:
277
+ wenet_configs = yaml.load(w, Loader=yaml.FullLoader)
278
+ self.extract_conf = copy.deepcopy(wenet_configs["dataset_conf"])
279
+ print("Loading Wenet Model...")
280
+ self.model = init_model(wenet_configs)
281
+ load_checkpoint(self.model, wenet_model_path)
282
+
283
+ if torch.cuda.is_available():
284
+ print("Using GPU...\n")
285
+ self.model = self.model.cuda()
286
+ else:
287
+ print("Using CPU...\n")
288
+
289
+ self.model = self.model.eval()
290
+
291
+ def extract_content_features(self, wavs, lens):
292
+ """extract content features from a batch of dataloader
293
+ Args:
294
+ wavs: tensor
295
+ lens: list
296
+ """
297
+ feats_list = []
298
+ lengths_list = []
299
+
300
+ device = next(self.model.parameters()).device
301
+ # Extract fbank/mfcc features by kaldi
302
+ assert self.extract_conf is not None, "load model first!"
303
+ feats_type = self.extract_conf.get("feats_type", "fbank")
304
+ assert feats_type in ["fbank", "mfcc"]
305
+
306
+ for idx, wav in enumerate(wavs):
307
+ # wav: (T)
308
+ wav = wav[: lens[idx]].to(device)
309
+
310
+ # pad one frame to compensate for the frame cut off after feature extraction
311
+ pad_tensor = torch.zeros(160, device=wav.device)
312
+ wav = torch.cat((wav, pad_tensor), dim=-1)
313
+ wav *= 1 << 15
314
+
315
+ wav = wav.unsqueeze(0) # (T) -> (1, T)
316
+ if feats_type == "fbank":
317
+ fbank_conf = self.extract_conf.get("fbank_conf", {})
318
+ feat = kaldi.fbank(
319
+ wav,
320
+ sample_frequency=16000,
321
+ num_mel_bins=fbank_conf["num_mel_bins"],
322
+ frame_length=fbank_conf["frame_length"],
323
+ frame_shift=fbank_conf["frame_shift"],
324
+ dither=fbank_conf["dither"],
325
+ )
326
+ elif feats_type == "mfcc":
327
+ mfcc_conf = self.extract_conf.get("mfcc", {})
328
+ feat = kaldi.mfcc(
329
+ wav,
330
+ sample_frequency=16000,
331
+ num_mel_bins=mfcc_conf["num_mel_bins"],
332
+ frame_length=mfcc_conf["frame_length"],
333
+ frame_shift=mfcc_conf["frame_shift"],
334
+ dither=mfcc_conf["dither"],
335
+ num_ceps=mfcc_conf.get("num_ceps", 40),
336
+ high_freq=mfcc_conf.get("high_freq", 0.0),
337
+ low_freq=mfcc_conf.get("low_freq", 20.0),
338
+ )
339
+ feats_list.append(feat)
340
+ lengths_list.append(feat.shape[0])
341
+
342
+ feats_lengths = torch.tensor(lengths_list, dtype=torch.int32).to(device)
343
+ feats_tensor = pad_sequence(feats_list, batch_first=True).to(
344
+ device
345
+ ) # (batch, len, 80)
346
+
347
+ features = self.model.encoder_extractor(
348
+ feats_tensor,
349
+ feats_lengths,
350
+ decoding_chunk_size=-1,
351
+ num_decoding_left_chunks=-1,
352
+ simulate_streaming=False,
353
+ )
354
+ return features
355
+
356
+
357
+ class MertExtractor(BaseExtractor):
358
+ def __init__(self, cfg):
359
+ super(MertExtractor, self).__init__(cfg)
360
+ self.extractor_type = "mert"
361
+ self.preprocessor = None
362
+
363
+ def load_model(self):
364
+ assert self.model is None
365
+ assert self.preprocessor is None
366
+
367
+ print("Loading MERT Model: ...", self.cfg.preprocess.mert_model)
368
+
369
+ # use the model id/path from the config instead of a hard-coded local path
370
+ model_name = self.cfg.preprocess.mert_model
371
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
373
+
374
+ if torch.cuda.is_available():
375
+ model = model.cuda()
376
+ preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(
377
+ model_name, trust_remote_code=True
378
+ )
379
+
380
+ self.model = model
381
+ self.preprocessor = preprocessor
382
+
383
+ def extract_content_features(self, wavs, lens):
384
+ """extract content features from a batch of dataloader
385
+ Args:
386
+ wavs: tensor (batch, T)
387
+ lens: list
388
+ """
389
+ with torch.no_grad():
390
+ sample_rate = self.preprocessor.sampling_rate
391
+ device = next(self.model.parameters()).device
392
+ assert (
393
+ sample_rate == self.cfg.preprocess.mert_sample_rate
394
+ ), "mert sample rate mismatch, expected {}, got {}".format(
395
+ self.cfg.preprocess.mert_sample_rate, sample_rate
396
+ )
397
+ mert_features = []
398
+ # wav: (len)
399
+ for wav in wavs:
400
+ # {input_values: tensor, attention_mask: tensor}
401
+ inputs = self.preprocessor(
402
+ wav, sampling_rate=sample_rate, return_tensors="pt"
403
+ ).to(device)
404
+
405
+ outputs = self.model(**inputs, output_hidden_states=True)
406
+ # (25 layers, time steps, 1024 feature_dim)
407
+ all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
408
+ # (1, frame_len, 1024) -> (frame_len, 1024)
409
+ feature = outputs.hidden_states[
410
+ self.cfg.preprocess.mert_feature_layer
411
+ ].squeeze(0)
412
+ mert_features.append(feature)
413
+
414
+ return mert_features
415
+
416
+
417
+ def extract_utt_content_features_dataloader(cfg, metadata, num_workers):
418
+ dataset_name = metadata[0]["Dataset"]
419
+
420
+ if cfg.preprocess.extract_whisper_feature:
421
+ feat_dir = os.path.join(cfg.preprocess.processed_dir, dataset_name, "whisper")
422
+ os.makedirs(feat_dir, exist_ok=True)
423
+ feat_files_num = len(os.listdir(feat_dir))
424
+
425
+ if feat_files_num != len(metadata):
426
+ whisper_waveforms = FFmpegDataset(
427
+ cfg, dataset_name, cfg.preprocess.whisper_sample_rate, metadata=metadata
428
+ )
429
+ data_loader = DataLoader(
430
+ whisper_waveforms,
431
+ num_workers=num_workers,
432
+ shuffle=False,
433
+ pin_memory=cfg.preprocess.pin_memory,
434
+ batch_size=cfg.preprocess.content_feature_batch_size,
435
+ collate_fn=collate_batch,
436
+ drop_last=False,
437
+ )
438
+ extractor = WhisperExtractor(cfg)
439
+ extractor.load_model()
440
+ for batch_idx, items in enumerate(tqdm(data_loader)):
441
+ _metadata, wavs, lens = items
442
+
443
+ batch_content_features = extractor.extract_content_features(
444
+ wavs,
445
+ lens,
446
+ )
447
+ for index, utt in enumerate(_metadata):
448
+ extractor.save_feature(utt, batch_content_features[index])
449
+
450
+ if cfg.preprocess.extract_contentvec_feature:
451
+ feat_dir = os.path.join(
452
+ cfg.preprocess.processed_dir, dataset_name, "contentvec"
453
+ )
454
+ os.makedirs(feat_dir, exist_ok=True)
455
+ feat_files_num = len(os.listdir(feat_dir))
456
+
457
+ if feat_files_num != len(metadata):
458
+ contentvec_waveforms = LibrosaDataset(
459
+ cfg,
460
+ dataset_name,
461
+ cfg.preprocess.contentvec_sample_rate,
462
+ metadata=metadata,
463
+ )
464
+ data_loader = DataLoader(
465
+ contentvec_waveforms,
466
+ num_workers=num_workers,
467
+ shuffle=False,
468
+ pin_memory=cfg.preprocess.pin_memory,
469
+ batch_size=cfg.preprocess.content_feature_batch_size,
470
+ collate_fn=collate_batch,
471
+ drop_last=False,
472
+ )
473
+ extractor = ContentvecExtractor(cfg)
474
+ extractor.load_model()
475
+ for batch_idx, items in enumerate(tqdm(data_loader)):
476
+ _metadata, wavs, lens = items
477
+
478
+ batch_content_features = extractor.extract_content_features(wavs, lens)
479
+ for index, utt in enumerate(_metadata):
480
+ extractor.save_feature(utt, batch_content_features[index])
481
+
482
+ if cfg.preprocess.extract_wenet_feature:
483
+ feat_dir = os.path.join(cfg.preprocess.processed_dir, dataset_name, "wenet")
484
+ os.makedirs(feat_dir, exist_ok=True)
485
+ feat_files_num = len(os.listdir(feat_dir))
486
+
487
+ if feat_files_num != len(metadata):
488
+ wenet_waveforms = TorchaudioDataset(
489
+ cfg, dataset_name, cfg.preprocess.wenet_sample_rate, metadata=metadata
490
+ )
491
+ data_loader = DataLoader(
492
+ wenet_waveforms,
493
+ num_workers=num_workers,
494
+ shuffle=False,
495
+ pin_memory=cfg.preprocess.pin_memory,
496
+ batch_size=cfg.preprocess.content_feature_batch_size,
497
+ collate_fn=collate_batch,
498
+ drop_last=False,
499
+ )
500
+ extractor = WenetExtractor(cfg)
501
+ extractor.load_model()
502
+ for batch_idx, items in enumerate(tqdm(data_loader)):
503
+ _metadata, wavs, lens = items
504
+
505
+ batch_content_features = extractor.extract_content_features(
506
+ wavs,
507
+ lens,
508
+ )
509
+ for index, utt in enumerate(_metadata):
510
+ extractor.save_feature(utt, batch_content_features[index])
511
+
512
+ if cfg.preprocess.extract_mert_feature:
513
+ feat_dir = os.path.join(cfg.preprocess.processed_dir, dataset_name, "mert")
514
+ os.makedirs(feat_dir, exist_ok=True)
515
+ feat_files_num = len(os.listdir(feat_dir))
516
+
517
+ if feat_files_num != len(metadata):
518
+ mert_waveforms = TorchaudioDataset(
519
+ cfg, dataset_name, cfg.preprocess.mert_sample_rate, metadata=metadata
520
+ )
521
+ data_loader = DataLoader(
522
+ mert_waveforms,
523
+ num_workers=num_workers,
524
+ shuffle=False,
525
+ pin_memory=cfg.preprocess.pin_memory,
526
+ batch_size=cfg.preprocess.content_feature_batch_size,
527
+ collate_fn=collate_batch,
528
+ drop_last=False,
529
+ )
530
+ extractor = MertExtractor(cfg)
531
+ extractor.load_model()
532
+ for batch_idx, items in enumerate(tqdm(data_loader)):
533
+ _metadata, wavs, lens = items
534
+
535
+ batch_content_features = extractor.extract_content_features(
536
+ wavs,
537
+ lens,
538
+ )
539
+ for index, utt in enumerate(_metadata):
540
+ extractor.save_feature(utt, batch_content_features[index])
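A rough sketch (not part of the commit) of the "Extractor Usage" steps described in the module docstring, using the WhisperExtractor API defined above. The load_config helper, config path, dataset name, Uid, duration, and the random waveforms are placeholders/assumptions; a real run uses an Amphion preprocess config (whisper_model, whisper_frameshift, processed_dir, ...).

import torch
from utils.util import load_config            # assumed Amphion config loader
from processors.content_extractor import WhisperExtractor

cfg = load_config("config/whisper_svc.json")  # placeholder path to a config with a `preprocess` section
extractor = WhisperExtractor(cfg)             # 1. initialize an extractor
extractor.load_model()                        # 2. load the Whisper checkpoint (GPU if available)

wavs = torch.randn(2, 16000 * 3)              # fake batch of two 3-second, 16 kHz waveforms
lens = [wavs.shape[1]] * 2                    # per-utterance lengths in samples
features = extractor.extract_content_features(wavs, lens)   # 3. (batch, frames, dim) Whisper features

utt = {"Dataset": "placeholder", "Uid": "0001", "Duration": 3.0}  # one metadata entry
extractor.save_feature(utt, features[0])      # 4. crop to the valid frames and save <Uid>.npy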
processors/data_augment.py ADDED
@@ -0,0 +1,378 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import random
8
+ import os
9
+ import json
10
+
11
+ import numpy as np
12
+ import parselmouth
13
+ import torch
14
+ import torchaudio
15
+ from tqdm import tqdm
16
+
17
+ from audiomentations import TimeStretch
18
+
19
+ from pedalboard import (
20
+ Pedalboard,
21
+ HighShelfFilter,
22
+ LowShelfFilter,
23
+ PeakFilter,
24
+ PitchShift,
25
+ )
26
+
27
+ from utils.util import has_existed
28
+
29
+ PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT = 0.0
30
+ PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT = 1.0
31
+ PRAAT_CHANGEGENDER_PITCHSHIFTRATIO_DEFAULT = 1.0
32
+ PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT = 1.0
33
+ PRAAT_CHANGEGENDER_DURATIONFACTOR_DEFAULT = 1.0
34
+
35
+
36
+ def wav_to_Sound(wav, sr: int) -> parselmouth.Sound:
37
+ """Convert a waveform to a parselmouth.Sound object
38
+
39
+ Args:
40
+ wav (np.ndarray/torch.Tensor): waveform of shape (n_channels, n_samples)
41
+ sr (int): sampling rate.
42
+
43
+ Returns:
44
+ parselmouth.Sound: a parselmouth.Sound object
45
+ """
46
+ assert wav.shape == (1, len(wav[0])), "wav must be of shape (1, n_samples)"
47
+ sound = None
48
+ if isinstance(wav, np.ndarray):
49
+ sound = parselmouth.Sound(wav[0], sampling_frequency=sr)
50
+ elif isinstance(wav, torch.Tensor):
51
+ sound = parselmouth.Sound(wav[0].numpy(), sampling_frequency=sr)
52
+ assert sound is not None, "wav must be either np.ndarray or torch.Tensor"
53
+ return sound
54
+
55
+
56
+ def get_pitch_median(wav, sr: int):
57
+ """Get the median pitch of a waveform
58
+
59
+ Args:
60
+ wav (np.ndarray/torch.Tensor): waveform of shape (n_channels, n_samples)
61
+ sr (int): sampling rate.
62
+
63
+ Returns:
64
+ parselmouth.Pitch, float: a parselmouth.Pitch object and the median pitch
65
+ """
66
+ if not isinstance(wav, parselmouth.Sound):
67
+ sound = wav_to_Sound(wav, sr)
68
+ else:
69
+ sound = wav
70
+ pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
71
+
72
+ # To Pitch: Time step(s)(standard value: 0.0), Pitch floor (Hz)(standard value: 75), Pitch ceiling (Hz)(standard value: 600.0)
73
+ pitch = parselmouth.praat.call(sound, "To Pitch", 0.8 / 75, 75, 600)
74
+ # Get quantile: From time (s), To time (s), Quantile(0.5 is then the 50% quantile, i.e., the median), Units (Hertz or Bark)
75
+ pitch_median = parselmouth.praat.call(pitch, "Get quantile", 0.0, 0.0, 0.5, "Hertz")
76
+
77
+ return pitch, pitch_median
78
+
79
+
80
+ def change_gender(
81
+ sound,
82
+ pitch=None,
83
+ formant_shift_ratio: float = PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT,
84
+ new_pitch_median: float = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT,
85
+ pitch_range_ratio: float = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT,
86
+ duration_factor: float = PRAAT_CHANGEGENDER_DURATIONFACTOR_DEFAULT,
87
+ ) -> parselmouth.Sound:
88
+ """Invoke change gender function in praat
89
+
90
+ Args:
91
+ sound (parselmouth.Sound): a parselmouth.Sound object
92
+ pitch (parselmouth.Pitch, optional): a parselmouth.Pitch object. Defaults to None.
93
+ formant_shift_ratio (float, optional): formant shift ratio. A value of 1.0 means no change. Greater than 1.0 shifts formants up; less than 1.0 shifts them down.
94
+ new_pitch_median (float, optional): new pitch median.
95
+ pitch_range_ratio (float, optional): pitch range ratio. A value of 1.0 means no change. Greater than 1.0 means higher pitch range. Less than 1.0 means lower pitch range.
96
+ duration_factor (float, optional): duration factor. A value of 1.0 means no change. Greater than 1.0 means longer duration. Less than 1.0 means shorter duration.
97
+
98
+ Returns:
99
+ parselmouth.Sound: a parselmouth.Sound object
100
+ """
101
+ if pitch is None:
102
+ new_sound = parselmouth.praat.call(
103
+ sound,
104
+ "Change gender",
105
+ 75,
106
+ 600,
107
+ formant_shift_ratio,
108
+ new_pitch_median,
109
+ pitch_range_ratio,
110
+ duration_factor,
111
+ )
112
+ else:
113
+ new_sound = parselmouth.praat.call(
114
+ (sound, pitch),
115
+ "Change gender",
116
+ formant_shift_ratio,
117
+ new_pitch_median,
118
+ pitch_range_ratio,
119
+ duration_factor,
120
+ )
121
+ return new_sound
122
+
123
+
124
+ def apply_formant_and_pitch_shift(
125
+ sound: parselmouth.Sound,
126
+ formant_shift_ratio: float = PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT,
127
+ pitch_shift_ratio: float = PRAAT_CHANGEGENDER_PITCHSHIFTRATIO_DEFAULT,
128
+ pitch_range_ratio: float = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT,
129
+ duration_factor: float = PRAAT_CHANGEGENDER_DURATIONFACTOR_DEFAULT,
130
+ ) -> parselmouth.Sound:
131
+ """use Praat "Changer gender" command to manipulate pitch and formant
132
+ "Change gender": Praat -> Sound Object -> Convert -> Change gender
133
+ Refer to the Praat help for more details.
134
+ # https://github.com/YannickJadoul/Parselmouth/issues/25#issuecomment-608632887 might help
135
+ """
136
+ pitch = None
137
+ new_pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
138
+ if pitch_shift_ratio != 1.0:
139
+ pitch, pitch_median = get_pitch_median(sound, sound.sampling_frequency)
140
+ new_pitch_median = pitch_median * pitch_shift_ratio
141
+
142
+ # refer to https://github.com/praat/praat/issues/1926#issuecomment-974909408
143
+ pitch_minimum = parselmouth.praat.call(
144
+ pitch, "Get minimum", 0.0, 0.0, "Hertz", "Parabolic"
145
+ )
146
+ new_median = pitch_median * pitch_shift_ratio
147
+ scaled_minimum = pitch_minimum * pitch_shift_ratio
148
+ result_minimum = new_median + (scaled_minimum - new_median) * pitch_range_ratio
149
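+ # Guard against an invalid (negative or NaN) pitch target: fall back to the Praat defaults.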
+ if result_minimum < 0:
150
+ new_pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
151
+ pitch_range_ratio = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT
152
+
153
+ if math.isnan(new_pitch_median):
154
+ new_pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
155
+ pitch_range_ratio = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT
156
+
157
+ new_sound = change_gender(
158
+ sound,
159
+ pitch,
160
+ formant_shift_ratio,
161
+ new_pitch_median,
162
+ pitch_range_ratio,
163
+ duration_factor,
164
+ )
165
+ return new_sound
166
+
167
+
168
+ # Function used in EQ
169
+ def pedalboard_equalizer(wav: np.ndarray, sr: int) -> np.ndarray:
170
+ """Use pedalboard to do equalizer"""
171
+ board = Pedalboard()
172
+
173
+ cutoff_low_freq = 60
174
+ cutoff_high_freq = 10000
175
+
176
+ q_min = 2
177
+ q_max = 5
178
+
179
+ random_all_freq = True
180
+ num_filters = 10
181
+ if random_all_freq:
182
+ key_freqs = [random.uniform(1, 12000) for _ in range(num_filters)]
183
+ else:
184
+ key_freqs = [
185
+ power_ratio(float(z) / (num_filters - 1), cutoff_low_freq, cutoff_high_freq)
186
+ for z in range(num_filters)
187
+ ]
188
+ q_values = [
189
+ power_ratio(random.uniform(0, 1), q_min, q_max) for _ in range(num_filters)
190
+ ]
191
+ gains = [random.uniform(-12, 12) for _ in range(num_filters)]
192
+ # low-shelving filter
193
+ board.append(
194
+ LowShelfFilter(
195
+ cutoff_frequency_hz=key_freqs[0], gain_db=gains[0], q=q_values[0]
196
+ )
197
+ )
198
+ # peaking filters
199
+ for i in range(1, 9):
200
+ board.append(
201
+ PeakFilter(
202
+ cutoff_frequency_hz=key_freqs[i], gain_db=gains[i], q=q_values[i]
203
+ )
204
+ )
205
+ # high-shelving filter
206
+ board.append(
207
+ HighShelfFilter(
208
+ cutoff_frequency_hz=key_freqs[9], gain_db=gains[9], q=q_values[9]
209
+ )
210
+ )
211
+
212
+ # Apply the pedalboard to the audio
213
+ processed_audio = board(wav, sr)
214
+ return processed_audio
215
+
216
+
217
+ def power_ratio(r: float, a: float, b: float):
218
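+ # Geometric interpolation: r=0 gives a, r=1 gives b, intermediate values are log-spaced.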
+ return a * math.pow((b / a), r)
219
+
220
+
221
+ def audiomentations_time_stretch(wav: np.ndarray, sr: int) -> np.ndarray:
222
+ """Use audiomentations to do time stretch"""
223
+ transform = TimeStretch(
224
+ min_rate=0.8, max_rate=1.25, leave_length_unchanged=False, p=1.0
225
+ )
226
+ augmented_wav = transform(wav, sample_rate=sr)
227
+ return augmented_wav
228
+
229
+
230
+ def formant_and_pitch_shift(
231
+ sound: parselmouth.Sound, fs: bool, ps: bool
232
+ ) -> parselmouth.Sound:
233
+ """ """
234
+ formant_shift_ratio = PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT
235
+ pitch_shift_ratio = PRAAT_CHANGEGENDER_PITCHSHIFTRATIO_DEFAULT
236
+ pitch_range_ratio = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT
237
+
238
+ assert fs != ps, "fs, ps are mutually exclusive"
239
+
240
+ if fs:
241
+ formant_shift_ratio = random.uniform(1.0, 1.4)
242
+ use_reciprocal = random.uniform(-1, 1) > 0
243
+ if use_reciprocal:
244
+ formant_shift_ratio = 1.0 / formant_shift_ratio
245
+ # only use praat to change formant
246
+ new_sound = apply_formant_and_pitch_shift(
247
+ sound,
248
+ formant_shift_ratio=formant_shift_ratio,
249
+ )
250
+ return new_sound
251
+
252
+ if ps:
253
+ board = Pedalboard()
254
+ board.append(PitchShift(random.uniform(-12, 12)))
255
+ wav_numpy = sound.values
256
+ wav_numpy = board(wav_numpy, sound.sampling_frequency)
257
+ # use pedalboard to change pitch
258
+ new_sound = parselmouth.Sound(
259
+ wav_numpy, sampling_frequency=sound.sampling_frequency
260
+ )
261
+ return new_sound
262
+
263
+
264
+ def wav_manipulation(
265
+ wav: torch.Tensor,
266
+ sr: int,
267
+ aug_type: str = "None",
268
+ formant_shift: bool = False,
269
+ pitch_shift: bool = False,
270
+ time_stretch: bool = False,
271
+ equalizer: bool = False,
272
+ ) -> torch.Tensor:
273
+ assert aug_type == "None" or aug_type in [
274
+ "formant_shift",
275
+ "pitch_shift",
276
+ "time_stretch",
277
+ "equalizer",
278
+ ], "aug_type must be one of formant_shift, pitch_shift, time_stretch, equalizer"
279
+
280
+ assert aug_type == "None" or (
281
+ formant_shift == False
282
+ and pitch_shift == False
283
+ and time_stretch == False
284
+ and equalizer == False
285
+ ), "if aug_type is specified, other argument must be False"
286
+
287
+ if aug_type != "None":
288
+ if aug_type == "formant_shift":
289
+ formant_shift = True
290
+ if aug_type == "pitch_shift":
291
+ pitch_shift = True
292
+ if aug_type == "equalizer":
293
+ equalizer = True
294
+ if aug_type == "time_stretch":
295
+ time_stretch = True
296
+
297
+ wav_numpy = wav.numpy()
298
+
299
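+ # Augmentations are applied in sequence: equalizer, then time stretch, then formant or pitch shift.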
+ if equalizer:
300
+ wav_numpy = pedalboard_equalizer(wav_numpy, sr)
301
+
302
+ if time_stretch:
303
+ wav_numpy = audiomentations_time_stretch(wav_numpy, sr)
304
+
305
+ sound = wav_to_Sound(wav_numpy, sr)
306
+
307
+ if formant_shift or pitch_shift:
308
+ sound = formant_and_pitch_shift(sound, formant_shift, pitch_shift)
309
+
310
+ wav = torch.from_numpy(sound.values).float()
311
+ # shape (1, n_samples)
312
+ return wav
313
+
314
+
315
+ def augment_dataset(cfg, dataset) -> list:
316
+ """Augment dataset with formant_shift, pitch_shift, time_stretch, equalizer
317
+
318
+ Args:
319
+ cfg (dict): configuration
320
+ dataset (str): dataset name
321
+
322
+ Returns:
323
+ list: augmented dataset names
324
+ """
325
+ # load metadata
326
+ dataset_path = os.path.join(cfg.preprocess.processed_dir, dataset)
327
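+ # Datasets whose name contains "eval" are assumed to have only a test split.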
+ split = ["train", "test"] if "eval" not in dataset else ["test"]
328
+ augment_datasets = []
329
+ aug_types = [
330
+ "formant_shift" if cfg.preprocess.use_formant_shift else None,
331
+ "pitch_shift" if cfg.preprocess.use_pitch_shift else None,
332
+ "time_stretch" if cfg.preprocess.use_time_stretch else None,
333
+ "equalizer" if cfg.preprocess.use_equalizer else None,
334
+ ]
335
+ aug_types = filter(None, aug_types)
336
+ for aug_type in aug_types:
337
+ print("Augmenting {} with {}...".format(dataset, aug_type))
338
+ new_dataset = dataset + "_" + aug_type
339
+ augment_datasets.append(new_dataset)
340
+ new_dataset_path = os.path.join(cfg.preprocess.processed_dir, new_dataset)
341
+
342
+ for dataset_type in split:
343
+ metadata_path = os.path.join(dataset_path, "{}.json".format(dataset_type))
344
+ augmented_metadata = []
345
+ new_metadata_path = os.path.join(
346
+ new_dataset_path, "{}.json".format(dataset_type)
347
+ )
348
+ os.makedirs(new_dataset_path, exist_ok=True)
349
+ new_dataset_wav_dir = os.path.join(new_dataset_path, "wav")
350
+ os.makedirs(new_dataset_wav_dir, exist_ok=True)
351
+
352
+ if has_existed(new_metadata_path):
353
+ continue
354
+
355
+ with open(metadata_path, "r") as f:
356
+ metadata = json.load(f)
357
+
358
+ for utt in tqdm(metadata):
359
+ original_wav_path = utt["Path"]
360
+ original_wav, sr = torchaudio.load(original_wav_path)
361
+ new_wav = wav_manipulation(original_wav, sr, aug_type=aug_type)
362
+ new_wav_path = os.path.join(new_dataset_wav_dir, utt["Uid"] + ".wav")
363
+ torchaudio.save(new_wav_path, new_wav, sr)
364
+ new_utt = {
365
+ "Dataset": utt["Dataset"] + "_" + aug_type,
366
+ "index": utt["index"],
367
+ "Singer": utt["Singer"],
368
+ "Uid": utt["Uid"],
369
+ "Path": new_wav_path,
370
+ "Duration": utt["Duration"],
371
+ }
372
+ augmented_metadata.append(new_utt)
373
+ new_metadata_path = os.path.join(
374
+ new_dataset_path, "{}.json".format(dataset_type)
375
+ )
376
+ with open(new_metadata_path, "w") as f:
377
+ json.dump(augmented_metadata, f, indent=4, ensure_ascii=False)
378
+ return augment_datasets
processors/phone_extractor.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from tqdm import tqdm
8
+ from text.g2p_module import G2PModule, LexiconModule
9
+ from text.symbol_table import SymbolTable
10
+
11
+ '''
12
+ phoneExtractor: extract phone from text
13
+ '''
14
+ class phoneExtractor:
15
+ def __init__(self, cfg, dataset_name=None, phone_symbol_file=None):
16
+ '''
17
+ Args:
18
+ cfg: config
19
+ dataset_name: name of dataset
20
+ '''
21
+ self.cfg = cfg
22
+
23
+ # phone symbols dict
24
+ self.phone_symbols = set()
25
+
26
+ # phone symbols dict file
27
+ if phone_symbol_file is not None:
28
+ self.phone_symbols_file = phone_symbol_file
29
+ elif dataset_name is not None:
30
+ self.dataset_name = dataset_name
31
+ self.phone_symbols_file = os.path.join(cfg.preprocess.processed_dir,
32
+ dataset_name,
33
+ cfg.preprocess.symbols_dict)
34
+
35
+
36
+ # initialize g2p module
37
+ if cfg.preprocess.phone_extractor in ["espeak", "pypinyin", "pypinyin_initials_finals"]:
38
+ self.g2p_module = G2PModule(backend=cfg.preprocess.phone_extractor)
39
+ elif cfg.preprocess.phone_extractor == 'lexicon':
40
+ assert cfg.preprocess.lexicon_path != ""
41
+ self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path)
42
+ else:
43
+ print('No support for', cfg.preprocess.phone_extractor)
44
+ raise NotImplementedError
45
+
46
+
47
+ def extract_phone(self, text):
48
+ '''
49
+ Extract phone from text
50
+ Args:
51
+
52
+ text: text of utterance
53
+
54
+ Returns:
55
+ phone_seq: list of phones of the utterance
56
+ (for g2p backends, self.phone_symbols is also updated with any new symbols)
57
+ '''
58
+
59
+ if self.cfg.preprocess.phone_extractor in ["espeak", "pypinyin", "pypinyin_initials_finals"]:
60
+ text = text.replace("”", '"').replace("“", '"')
61
+ phone = self.g2p_module.g2p_conversion(text=text)
62
+ self.phone_symbols.update(phone)
63
+ phone_seq = [phn for phn in phone]
64
+
65
+ elif self.cfg.preprocess.phone_extractor == 'lexicon':
66
+ phone_seq = self.g2p_module.g2p_conversion(text)
67
+ phone = phone_seq
68
+ if not isinstance(phone_seq, list):
69
+ phone_seq = phone_seq.split()
70
+
71
+ return phone_seq
72
+
73
+ def save_dataset_phone_symbols_to_table(self):
74
+ # load and merge saved phone symbols
75
+ if os.path.exists(self.phone_symbols_file):
76
+ phone_symbol_dict_saved = SymbolTable.from_file(self.phone_symbols_file)._sym2id.keys()
77
+ self.phone_symbols.update(set(phone_symbol_dict_saved))
78
+
79
+ # save phone symbols
80
+ phone_symbol_dict = SymbolTable()
81
+ for s in sorted(list(self.phone_symbols)):
82
+ phone_symbol_dict.add(s)
83
+ phone_symbol_dict.to_file(self.phone_symbols_file)
84
+
85
+
86
+ def extract_utt_phone_sequence(cfg, metadata):
87
+ '''
88
+ Extract phone sequence from text
89
+ Args:
90
+ cfg: config
91
+ metadata: list of dict, each dict contains "Uid", "Text"
92
+
93
+ '''
94
+
95
+ dataset_name = cfg.dataset[0]
96
+
97
+ # output path
98
+ out_path = os.path.join(cfg.preprocess.processed_dir, dataset_name, cfg.preprocess.phone_dir)
99
+ os.makedirs(out_path, exist_ok=True)
100
+
101
+ phone_extractor = phoneExtractor(cfg, dataset_name)
102
+
103
+ for utt in tqdm(metadata):
104
+ uid = utt["Uid"]
105
+ text = utt["Text"]
106
+
107
+ phone_seq = phone_extractor.extract_phone(text)
108
+
109
+ phone_path = os.path.join(out_path, uid+'.phone')
110
+ with open(phone_path, 'w') as fout:
111
+ fout.write(' '.join(phone_seq))
112
+
113
+ if cfg.preprocess.phone_extractor != 'lexicon':
114
+ phone_extractor.save_dataset_phone_symbols_to_table()
115
+
116
+
117
+
118
+ def save_all_dataset_phone_symbols_to_table(self, cfg, dataset):
119
+ # phone symbols dict
120
+ phone_symbols = set()
121
+
122
+ for dataset_name in dataset:
123
+ phone_symbols_file = os.path.join(cfg.preprocess.processed_dir,
124
+ dataset_name,
125
+ cfg.preprocess.symbols_dict)
126
+
127
+ # load and merge saved phone symbols
128
+ assert os.path.exists(phone_symbols_file)
129
+ phone_symbol_dict_saved = SymbolTable.from_file(phone_symbols_file)._sym2id.keys()
130
+ phone_symbols.update(set(phone_symbol_dict_saved))
131
+
132
+ # save all phone symbols to each dataset
133
+ phone_symbol_dict = SymbolTable()
134
+ for s in sorted(list(phone_symbols)):
135
+ phone_symbol_dict.add(s)
136
+ for dataset_name in dataset:
137
+ phone_symbols_file = os.path.join(cfg.preprocess.processed_dir,
138
+ dataset_name,
139
+ cfg.preprocess.symbols_dict)
140
+ phone_symbol_dict.to_file(phone_symbols_file)
141
+
142
+
text/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+ import re
8
+ from text import cleaners
9
+ from text.symbols import symbols
10
+
11
+
12
+ # Mappings from symbol to numeric ID and vice versa:
13
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
14
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
15
+
16
+ # Regular expression matching text enclosed in curly braces:
17
+ _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
18
+
19
+
20
+ def text_to_sequence(text, cleaner_names):
21
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
22
+
23
+ The text can optionally have ARPAbet sequences enclosed in curly braces embedded
24
+ in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
25
+
26
+ Args:
27
+ text: string to convert to a sequence
28
+ cleaner_names: names of the cleaner functions to run the text through
29
+
30
+ Returns:
31
+ List of integers corresponding to the symbols in the text
32
+ """
33
+ sequence = []
34
+
35
+ # Check for curly braces and treat their contents as ARPAbet:
36
+ while len(text):
37
+ m = _curly_re.match(text)
38
+
39
+ if not m:
40
+ sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
41
+ break
42
+ sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
43
+ sequence += _arpabet_to_sequence(m.group(2))
44
+ text = m.group(3)
45
+ return sequence
46
+
47
+
48
+ def sequence_to_text(sequence):
49
+ """Converts a sequence of IDs back to a string"""
50
+ result = ""
51
+ for symbol_id in sequence:
52
+ if symbol_id in _id_to_symbol:
53
+ s = _id_to_symbol[symbol_id]
54
+ # Enclose ARPAbet back in curly braces:
55
+ if len(s) > 1 and s[0] == "@":
56
+ s = "{%s}" % s[1:]
57
+ result += s
58
+ return result.replace("}{", " ")
59
+
60
+
61
+ def _clean_text(text, cleaner_names):
62
+ for name in cleaner_names:
63
+ cleaner = getattr(cleaners, name)
64
+ if not cleaner:
65
+ raise Exception("Unknown cleaner: %s" % name)
66
+ text = cleaner(text)
67
+ return text
68
+
69
+
70
+ def _symbols_to_sequence(symbols):
71
+ return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
72
+
73
+
74
+ def _arpabet_to_sequence(text):
75
+ return _symbols_to_sequence(["@" + s for s in text.split()])
76
+
77
+
78
+ def _should_keep_symbol(s):
79
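+ # Drop unknown symbols as well as the padding ("_") and EOS ("~") tokens.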
+ return s in _symbol_to_id and s != "_" and s != "~"
text/cleaners.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ """
9
+ Cleaners are transformations that run over the input text at both training and eval time.
10
+
11
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
12
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
13
+ 1. "english_cleaners" for English text
14
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
15
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
16
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
17
+ the symbols in symbols.py to match your data).
18
+ """
19
+
20
+
21
+ # Regular expression matching whitespace:
22
+ import re
23
+ from unidecode import unidecode
24
+ from .numbers import normalize_numbers
25
+
26
+ _whitespace_re = re.compile(r"\s+")
27
+
28
+ # List of (regular expression, replacement) pairs for abbreviations:
29
+ _abbreviations = [
30
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
31
+ for x in [
32
+ ("mrs", "misess"),
33
+ ("mr", "mister"),
34
+ ("dr", "doctor"),
35
+ ("st", "saint"),
36
+ ("co", "company"),
37
+ ("jr", "junior"),
38
+ ("maj", "major"),
39
+ ("gen", "general"),
40
+ ("drs", "doctors"),
41
+ ("rev", "reverend"),
42
+ ("lt", "lieutenant"),
43
+ ("hon", "honorable"),
44
+ ("sgt", "sergeant"),
45
+ ("capt", "captain"),
46
+ ("esq", "esquire"),
47
+ ("ltd", "limited"),
48
+ ("col", "colonel"),
49
+ ("ft", "fort"),
50
+ ]
51
+ ]
52
+
53
+
54
+ def expand_abbreviations(text):
55
+ for regex, replacement in _abbreviations:
56
+ text = re.sub(regex, replacement, text)
57
+ return text
58
+
59
+
60
+ def expand_numbers(text):
61
+ return normalize_numbers(text)
62
+
63
+
64
+ def lowercase(text):
65
+ return text.lower()
66
+
67
+
68
+ def collapse_whitespace(text):
69
+ return re.sub(_whitespace_re, " ", text)
70
+
71
+
72
+ def convert_to_ascii(text):
73
+ return unidecode(text)
74
+
75
+
76
+ def basic_cleaners(text):
77
+ """Basic pipeline that lowercases and collapses whitespace without transliteration."""
78
+ text = lowercase(text)
79
+ text = collapse_whitespace(text)
80
+ return text
81
+
82
+
83
+ def transliteration_cleaners(text):
84
+ """Pipeline for non-English text that transliterates to ASCII."""
85
+ text = convert_to_ascii(text)
86
+ text = lowercase(text)
87
+ text = collapse_whitespace(text)
88
+ return text
89
+
90
+
91
+ def english_cleaners(text):
92
+ """Pipeline for English text, including number and abbreviation expansion."""
93
+ text = convert_to_ascii(text)
94
+ text = lowercase(text)
95
+ text = expand_numbers(text)
96
+ text = expand_abbreviations(text)
97
+ text = collapse_whitespace(text)
98
+ return text
text/cmudict.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ import re
9
+
10
+
11
+ valid_symbols = [
12
+ "AA",
13
+ "AA0",
14
+ "AA1",
15
+ "AA2",
16
+ "AE",
17
+ "AE0",
18
+ "AE1",
19
+ "AE2",
20
+ "AH",
21
+ "AH0",
22
+ "AH1",
23
+ "AH2",
24
+ "AO",
25
+ "AO0",
26
+ "AO1",
27
+ "AO2",
28
+ "AW",
29
+ "AW0",
30
+ "AW1",
31
+ "AW2",
32
+ "AY",
33
+ "AY0",
34
+ "AY1",
35
+ "AY2",
36
+ "B",
37
+ "CH",
38
+ "D",
39
+ "DH",
40
+ "EH",
41
+ "EH0",
42
+ "EH1",
43
+ "EH2",
44
+ "ER",
45
+ "ER0",
46
+ "ER1",
47
+ "ER2",
48
+ "EY",
49
+ "EY0",
50
+ "EY1",
51
+ "EY2",
52
+ "F",
53
+ "G",
54
+ "HH",
55
+ "IH",
56
+ "IH0",
57
+ "IH1",
58
+ "IH2",
59
+ "IY",
60
+ "IY0",
61
+ "IY1",
62
+ "IY2",
63
+ "JH",
64
+ "K",
65
+ "L",
66
+ "M",
67
+ "N",
68
+ "NG",
69
+ "OW",
70
+ "OW0",
71
+ "OW1",
72
+ "OW2",
73
+ "OY",
74
+ "OY0",
75
+ "OY1",
76
+ "OY2",
77
+ "P",
78
+ "R",
79
+ "S",
80
+ "SH",
81
+ "T",
82
+ "TH",
83
+ "UH",
84
+ "UH0",
85
+ "UH1",
86
+ "UH2",
87
+ "UW",
88
+ "UW0",
89
+ "UW1",
90
+ "UW2",
91
+ "V",
92
+ "W",
93
+ "Y",
94
+ "Z",
95
+ "ZH",
96
+ ]
97
+
98
+ _valid_symbol_set = set(valid_symbols)
99
+
100
+
101
+ class CMUDict:
102
+ """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
103
+
104
+ def __init__(self, file_or_path, keep_ambiguous=True):
105
+ if isinstance(file_or_path, str):
106
+ with open(file_or_path, encoding="latin-1") as f:
107
+ entries = _parse_cmudict(f)
108
+ else:
109
+ entries = _parse_cmudict(file_or_path)
110
+ if not keep_ambiguous:
111
+ entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
112
+ self._entries = entries
113
+
114
+ def __len__(self):
115
+ return len(self._entries)
116
+
117
+ def lookup(self, word):
118
+ """Returns list of ARPAbet pronunciations of the given word."""
119
+ return self._entries.get(word.upper())
120
+
121
+
122
+ _alt_re = re.compile(r"\([0-9]+\)")
123
+
124
+
125
+ def _parse_cmudict(file):
126
+ cmudict = {}
127
+ for line in file:
128
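+ # Dictionary entries start with an uppercase letter or an apostrophe; other lines are skipped.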
+ if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
129
+ parts = line.split(" ")
130
+ word = re.sub(_alt_re, "", parts[0])
131
+ pronunciation = _get_pronunciation(parts[1])
132
+ if pronunciation:
133
+ if word in cmudict:
134
+ cmudict[word].append(pronunciation)
135
+ else:
136
+ cmudict[word] = [pronunciation]
137
+ return cmudict
138
+
139
+
140
+ def _get_pronunciation(s):
141
+ parts = s.strip().split(" ")
142
+ for part in parts:
143
+ if part not in _valid_symbol_set:
144
+ return None
145
+ return " ".join(parts)
text/g2p.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+ from g2p_en import G2p
8
+ from string import punctuation
9
+
10
+
11
+ def read_lexicon(lex_path):
12
+ lexicon = {}
13
+ with open(lex_path) as f:
14
+ for line in f:
15
+ temp = re.split(r"\s+", line.strip("\n"))
16
+ word = temp[0]
17
+ phones = temp[1:]
18
+ if word.lower() not in lexicon:
19
+ lexicon[word.lower()] = phones
20
+ return lexicon
21
+
22
+
23
+ def preprocess_english(text, lexicon):
24
+ text = text.rstrip(punctuation)
25
+
26
+ g2p = G2p()
27
+ phones = []
28
+ words = re.split(r"([,;.\-\?\!\s+])", text)
29
+ for w in words:
30
+ if w.lower() in lexicon:
31
+ phones += lexicon[w.lower()]
32
+ else:
33
+ phones += list(filter(lambda p: p != " ", g2p(w)))
34
+ phones = "}{".join(phones)
35
+ phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
36
+ phones = phones.replace("}{", " ")
37
+
38
+ return phones
text/g2p_module.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import re
8
+ from g2p_en import G2p
9
+ from string import punctuation
10
+ from typing import Any, Dict, List, Optional, Pattern, Union
11
+
12
+ from phonemizer.backend import EspeakBackend
13
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
14
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
15
+ from phonemizer.punctuation import Punctuation
16
+ from phonemizer.separator import Separator
17
+
18
+ try:
19
+ from pypinyin import Style, pinyin
20
+ from pypinyin.style._utils import get_finals, get_initials
21
+ except Exception:
22
+ pass
23
+
24
+
25
+ # This code is modified from
26
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
27
+
28
+ class PypinyinBackend:
29
+ """PypinyinBackend for Chinese. Most codes is referenced from espnet.
30
+ There are two modes, pinyin or initials_finals: one produces
31
+ just like "ni1 hao3", the other is like "n i1 h ao3".
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ backend="initials_finals",
37
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
38
+ ) -> None:
39
+ self.backend = backend
40
+ self.punctuation_marks = punctuation_marks
41
+
42
+ def phonemize(
43
+ self, text: List[str], separator: Separator, strip=True, njobs=1
44
+ ) -> List[str]:
45
+ assert isinstance(text, List)
46
+ phonemized = []
47
+ for _text in text:
48
+ _text = re.sub(" +", " ", _text.strip())
49
+ _text = _text.replace(" ", separator.word)
50
+ phones = []
51
+ if self.backend == "pypinyin":
52
+ for n, py in enumerate(
53
+ pinyin(
54
+ _text, style=Style.TONE3, neutral_tone_with_five=True
55
+ )
56
+ ):
57
+ if all([c in self.punctuation_marks for c in py[0]]):
58
+ if len(phones):
59
+ assert phones[-1] == separator.syllable
60
+ phones.pop(-1)
61
+
62
+ phones.extend(list(py[0]))
63
+ else:
64
+ phones.extend([py[0], separator.syllable])
65
+ elif self.backend == "pypinyin_initials_finals":
66
+ for n, py in enumerate(
67
+ pinyin(
68
+ _text, style=Style.TONE3, neutral_tone_with_five=True
69
+ )
70
+ ):
71
+ if all([c in self.punctuation_marks for c in py[0]]):
72
+ if len(phones):
73
+ assert phones[-1] == separator.syllable
74
+ phones.pop(-1)
75
+ phones.extend(list(py[0]))
76
+ else:
77
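+ # Split each alphanumeric pinyin syllable into initial and final; the tone digit stays on the final.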
+ if py[0][-1].isalnum():
78
+ initial = get_initials(py[0], strict=False)
79
+ if py[0][-1].isdigit():
80
+ final = (
81
+ get_finals(py[0][:-1], strict=False)
82
+ + py[0][-1]
83
+ )
84
+ else:
85
+ final = get_finals(py[0], strict=False)
86
+ phones.extend(
87
+ [
88
+ initial,
89
+ separator.phone,
90
+ final,
91
+ separator.syllable,
92
+ ]
93
+ )
94
+ else:
95
+ assert ValueError
96
+ else:
97
+ raise NotImplementedError
98
+ phonemized.append(
99
+ "".join(phones).rstrip(f"{separator.word}{separator.syllable}")
100
+ )
101
+ return phonemized
102
+
103
+
104
+ class G2PModule:
105
+ """Phonemize Text."""
106
+
107
+ def __init__(
108
+ self,
109
+ language="en-us",
110
+ backend="espeak",
111
+ separator=Separator(word="_", syllable="-", phone="|"),
112
+ preserve_punctuation=True,
113
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
114
+ with_stress: bool = False,
115
+ tie: Union[bool, str] = False,
116
+ language_switch: LanguageSwitch = "keep-flags",
117
+ words_mismatch: WordMismatch = "ignore",
118
+ ) -> None:
119
+
120
+ self.backend = self._initialize_backend(
121
+ backend, language, punctuation_marks, preserve_punctuation,
122
+ with_stress, tie, language_switch, words_mismatch
123
+ )
124
+ self.separator = separator
125
+
126
+ def _initialize_backend(
127
+ self, backend, language, punctuation_marks, preserve_punctuation,
128
+ with_stress, tie, language_switch, words_mismatch
129
+ ):
130
+ if backend == "espeak":
131
+ return EspeakBackend(
132
+ language,
133
+ punctuation_marks=punctuation_marks,
134
+ preserve_punctuation=preserve_punctuation,
135
+ with_stress=with_stress,
136
+ tie=tie,
137
+ language_switch=language_switch,
138
+ words_mismatch=words_mismatch,
139
+ )
140
+ elif backend in ["pypinyin", "pypinyin_initials_finals"]:
141
+ return PypinyinBackend(
142
+ backend=backend,
143
+ punctuation_marks=punctuation_marks + self.separator.word,
144
+ )
145
+ else:
146
+ raise NotImplementedError(f"{backend}")
147
+
148
+ def to_list(self, phonemized: str) -> List[str]:
149
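+ # Split a phonemized string into phone tokens, keeping the word separator as its own token.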
+ fields = []
150
+ for word in phonemized.split(self.separator.word):
151
+ pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
152
+ fields.extend(
153
+ [p for p in pp if p != self.separator.phone]
154
+ + [self.separator.word]
155
+ )
156
+ assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
157
+ self.separator.phone
158
+ )
159
+ return fields[:-1]
160
+
161
+
162
+ def phonemization(self, text, strip=True) -> List[List[str]]:
163
+ if isinstance(text, str):
164
+ text = [text]
165
+
166
+ phonemized = self.backend.phonemize(
167
+ text, separator=self.separator, strip=strip, njobs=1
168
+ )
169
+ phonemes = [self.to_list(p) for p in phonemized]
170
+ return phonemes
171
+
172
+ def g2p_conversion(self, text: str) -> List[str]:
173
+ phonemes = self.phonemization([text.strip()])
174
+ return phonemes[0]
175
+
176
+
177
+ class LexiconModule:
178
+ def __init__(self, lex_path, language="en-us") -> None:
179
+
180
+ # todo: check lexicon derivation, merge with G2PModule?
181
+ lexicon = {}
182
+ with open(lex_path) as f:
183
+ for line in f:
184
+ temp = re.split(r"\s+", line.strip("\n"))
185
+ word = temp[0]
186
+ phones = temp[1:]
187
+ if word.lower() not in lexicon:
188
+ lexicon[word.lower()] = phones
189
+ self.lexicon = lexicon
190
+ self.language = language
191
+
192
+ def g2p_conversion(self, text):
193
+ phone = None
194
+
195
+ # todo: preprocess with other languages
196
+ if self.language == 'en-us':
197
+ phone = self.preprocess_english(text)
198
+ else:
199
+ print('No support for', self.language)
200
+ raise NotImplementedError
201
+
202
+ return phone
203
+
204
+
205
+ def preprocess_english(self, text):
206
+ text = text.rstrip(punctuation)
207
+
208
+ g2p = G2p()
209
+ phones = []
210
+ words = re.split(r"([,;.\-\?\!\s+])", text)
211
+ for w in words:
212
+ if w.lower() in self.lexicon:
213
+ phones += self.lexicon[w.lower()]
214
+ else:
215
+ phones += list(filter(lambda p: p != " ", g2p(w)))
216
+ phones = "}{".join(phones)
217
+ phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
218
+ phones = phones.replace("}{", " ")
219
+
220
+
221
+ return phones
text/lexicon/librispeech-lexicon.txt ADDED
The diff for this file is too large to render. See raw diff
 
text/lexicon/pinyin-lexicon-r.txt ADDED
@@ -0,0 +1,4120 @@
1
+ a1 a1
2
+ a2 a2
3
+ a3 a3
4
+ a4 a4
5
+ a5 a5
6
+ ai1 ai1
7
+ ai2 ai2
8
+ ai3 ai3
9
+ ai4 ai4
10
+ ai5 ai5
11
+ an1 an1
12
+ an2 an2
13
+ an3 an3
14
+ an4 an4
15
+ an5 an5
16
+ ang1 ang1
17
+ ang2 ang2
18
+ ang3 ang3
19
+ ang4 ang4
20
+ ang5 ang5
21
+ ao1 ao1
22
+ ao2 ao2
23
+ ao3 ao3
24
+ ao4 ao4
25
+ ao5 ao5
26
+ ba1 b a1
27
+ ba2 b a2
28
+ ba3 b a3
29
+ ba4 b a4
30
+ ba5 b a5
31
+ bai1 b ai1
32
+ bai2 b ai2
33
+ bai3 b ai3
34
+ bai4 b ai4
35
+ bai5 b ai5
36
+ ban1 b an1
37
+ ban2 b an2
38
+ ban3 b an3
39
+ ban4 b an4
40
+ ban5 b an5
41
+ bang1 b ang1
42
+ bang2 b ang2
43
+ bang3 b ang3
44
+ bang4 b ang4
45
+ bang5 b ang5
46
+ bao1 b ao1
47
+ bao2 b ao2
48
+ bao3 b ao3
49
+ bao4 b ao4
50
+ bao5 b ao5
51
+ bei1 b ei1
52
+ bei2 b ei2
53
+ bei3 b ei3
54
+ bei4 b ei4
55
+ bei5 b ei5
56
+ ben1 b en1
57
+ ben2 b en2
58
+ ben3 b en3
59
+ ben4 b en4
60
+ ben5 b en5
61
+ beng1 b eng1
62
+ beng2 b eng2
63
+ beng3 b eng3
64
+ beng4 b eng4
65
+ beng5 b eng5
66
+ bi1 b i1
67
+ bi2 b i2
68
+ bi3 b i3
69
+ bi4 b i4
70
+ bi5 b i5
71
+ bian1 b ian1
72
+ bian2 b ian2
73
+ bian3 b ian3
74
+ bian4 b ian4
75
+ bian5 b ian5
76
+ biao1 b iao1
77
+ biao2 b iao2
78
+ biao3 b iao3
79
+ biao4 b iao4
80
+ biao5 b iao5
81
+ bie1 b ie1
82
+ bie2 b ie2
83
+ bie3 b ie3
84
+ bie4 b ie4
85
+ bie5 b ie5
86
+ bin1 b in1
87
+ bin2 b in2
88
+ bin3 b in3
89
+ bin4 b in4
90
+ bin5 b in5
91
+ bing1 b ing1
92
+ bing2 b ing2
93
+ bing3 b ing3
94
+ bing4 b ing4
95
+ bing5 b ing5
96
+ bo1 b o1
97
+ bo2 b o2
98
+ bo3 b o3
99
+ bo4 b o4
100
+ bo5 b o5
101
+ bu1 b u1
102
+ bu2 b u2
103
+ bu3 b u3
104
+ bu4 b u4
105
+ bu5 b u5
106
+ ca1 c a1
107
+ ca2 c a2
108
+ ca3 c a3
109
+ ca4 c a4
110
+ ca5 c a5
111
+ cai1 c ai1
112
+ cai2 c ai2
113
+ cai3 c ai3
114
+ cai4 c ai4
115
+ cai5 c ai5
116
+ can1 c an1
117
+ can2 c an2
118
+ can3 c an3
119
+ can4 c an4
120
+ can5 c an5
121
+ cang1 c ang1
122
+ cang2 c ang2
123
+ cang3 c ang3
124
+ cang4 c ang4
125
+ cang5 c ang5
126
+ cao1 c ao1
127
+ cao2 c ao2
128
+ cao3 c ao3
129
+ cao4 c ao4
130
+ cao5 c ao5
131
+ ce1 c e1
132
+ ce2 c e2
133
+ ce3 c e3
134
+ ce4 c e4
135
+ ce5 c e5
136
+ cen1 c en1
137
+ cen2 c en2
138
+ cen3 c en3
139
+ cen4 c en4
140
+ cen5 c en5
141
+ ceng1 c eng1
142
+ ceng2 c eng2
143
+ ceng3 c eng3
144
+ ceng4 c eng4
145
+ ceng5 c eng5
146
+ cha1 ch a1
147
+ cha2 ch a2
148
+ cha3 ch a3
149
+ cha4 ch a4
150
+ cha5 ch a5
151
+ chai1 ch ai1
152
+ chai2 ch ai2
153
+ chai3 ch ai3
154
+ chai4 ch ai4
155
+ chai5 ch ai5
156
+ chan1 ch an1
157
+ chan2 ch an2
158
+ chan3 ch an3
159
+ chan4 ch an4
160
+ chan5 ch an5
161
+ chang1 ch ang1
162
+ chang2 ch ang2
163
+ chang3 ch ang3
164
+ chang4 ch ang4
165
+ chang5 ch ang5
166
+ chao1 ch ao1
167
+ chao2 ch ao2
168
+ chao3 ch ao3
169
+ chao4 ch ao4
170
+ chao5 ch ao5
171
+ che1 ch e1
172
+ che2 ch e2
173
+ che3 ch e3
174
+ che4 ch e4
175
+ che5 ch e5
176
+ chen1 ch en1
177
+ chen2 ch en2
178
+ chen3 ch en3
179
+ chen4 ch en4
180
+ chen5 ch en5
181
+ cheng1 ch eng1
182
+ cheng2 ch eng2
183
+ cheng3 ch eng3
184
+ cheng4 ch eng4
185
+ cheng5 ch eng5
186
+ chi1 ch iii1
187
+ chi2 ch iii2
188
+ chi3 ch iii3
189
+ chi4 ch iii4
190
+ chi5 ch iii5
191
+ chong1 ch ong1
192
+ chong2 ch ong2
193
+ chong3 ch ong3
194
+ chong4 ch ong4
195
+ chong5 ch ong5
196
+ chou1 ch ou1
197
+ chou2 ch ou2
198
+ chou3 ch ou3
199
+ chou4 ch ou4
200
+ chou5 ch ou5
201
+ chu1 ch u1
202
+ chu2 ch u2
203
+ chu3 ch u3
204
+ chu4 ch u4
205
+ chu5 ch u5
206
+ chuai1 ch uai1
207
+ chuai2 ch uai2
208
+ chuai3 ch uai3
209
+ chuai4 ch uai4
210
+ chuai5 ch uai5
211
+ chuan1 ch uan1
212
+ chuan2 ch uan2
213
+ chuan3 ch uan3
214
+ chuan4 ch uan4
215
+ chuan5 ch uan5
216
+ chuang1 ch uang1
217
+ chuang2 ch uang2
218
+ chuang3 ch uang3
219
+ chuang4 ch uang4
220
+ chuang5 ch uang5
221
+ chui1 ch uei1
222
+ chui2 ch uei2
223
+ chui3 ch uei3
224
+ chui4 ch uei4
225
+ chui5 ch uei5
226
+ chun1 ch uen1
227
+ chun2 ch uen2
228
+ chun3 ch uen3
229
+ chun4 ch uen4
230
+ chun5 ch uen5
231
+ chuo1 ch uo1
232
+ chuo2 ch uo2
233
+ chuo3 ch uo3
234
+ chuo4 ch uo4
235
+ chuo5 ch uo5
236
+ ci1 c ii1
237
+ ci2 c ii2
238
+ ci3 c ii3
239
+ ci4 c ii4
240
+ ci5 c ii5
241
+ cong1 c ong1
242
+ cong2 c ong2
243
+ cong3 c ong3
244
+ cong4 c ong4
245
+ cong5 c ong5
246
+ cou1 c ou1
247
+ cou2 c ou2
248
+ cou3 c ou3
249
+ cou4 c ou4
250
+ cou5 c ou5
251
+ cu1 c u1
252
+ cu2 c u2
253
+ cu3 c u3
254
+ cu4 c u4
255
+ cu5 c u5
256
+ cuan1 c uan1
257
+ cuan2 c uan2
258
+ cuan3 c uan3
259
+ cuan4 c uan4
260
+ cuan5 c uan5
261
+ cui1 c uei1
262
+ cui2 c uei2
263
+ cui3 c uei3
264
+ cui4 c uei4
265
+ cui5 c uei5
266
+ cun1 c uen1
267
+ cun2 c uen2
268
+ cun3 c uen3
269
+ cun4 c uen4
270
+ cun5 c uen5
271
+ cuo1 c uo1
272
+ cuo2 c uo2
273
+ cuo3 c uo3
274
+ cuo4 c uo4
275
+ cuo5 c uo5
276
+ da1 d a1
277
+ da2 d a2
278
+ da3 d a3
279
+ da4 d a4
280
+ da5 d a5
281
+ dai1 d ai1
282
+ dai2 d ai2
283
+ dai3 d ai3
284
+ dai4 d ai4
285
+ dai5 d ai5
286
+ dan1 d an1
287
+ dan2 d an2
288
+ dan3 d an3
289
+ dan4 d an4
290
+ dan5 d an5
291
+ dang1 d ang1
292
+ dang2 d ang2
293
+ dang3 d ang3
294
+ dang4 d ang4
295
+ dang5 d ang5
296
+ dao1 d ao1
297
+ dao2 d ao2
298
+ dao3 d ao3
299
+ dao4 d ao4
300
+ dao5 d ao5
301
+ de1 d e1
302
+ de2 d e2
303
+ de3 d e3
304
+ de4 d e4
305
+ de5 d e5
306
+ dei1 d ei1
307
+ dei2 d ei2
308
+ dei3 d ei3
309
+ dei4 d ei4
310
+ dei5 d ei5
311
+ den1 d en1
312
+ den2 d en2
313
+ den3 d en3
314
+ den4 d en4
315
+ den5 d en5
316
+ deng1 d eng1
317
+ deng2 d eng2
318
+ deng3 d eng3
319
+ deng4 d eng4
320
+ deng5 d eng5
321
+ di1 d i1
322
+ di2 d i2
323
+ di3 d i3
324
+ di4 d i4
325
+ di5 d i5
326
+ dia1 d ia1
327
+ dia2 d ia2
328
+ dia3 d ia3
329
+ dia4 d ia4
330
+ dia5 d ia5
331
+ dian1 d ian1
332
+ dian2 d ian2
333
+ dian3 d ian3
334
+ dian4 d ian4
335
+ dian5 d ian5
336
+ diao1 d iao1
337
+ diao2 d iao2
338
+ diao3 d iao3
339
+ diao4 d iao4
340
+ diao5 d iao5
341
+ die1 d ie1
342
+ die2 d ie2
343
+ die3 d ie3
344
+ die4 d ie4
345
+ die5 d ie5
346
+ ding1 d ing1
347
+ ding2 d ing2
348
+ ding3 d ing3
349
+ ding4 d ing4
350
+ ding5 d ing5
351
+ diu1 d iou1
352
+ diu2 d iou2
353
+ diu3 d iou3
354
+ diu4 d iou4
355
+ diu5 d iou5
356
+ dong1 d ong1
357
+ dong2 d ong2
358
+ dong3 d ong3
359
+ dong4 d ong4
360
+ dong5 d ong5
361
+ dou1 d ou1
362
+ dou2 d ou2
363
+ dou3 d ou3
364
+ dou4 d ou4
365
+ dou5 d ou5
366
+ du1 d u1
367
+ du2 d u2
368
+ du3 d u3
369
+ du4 d u4
370
+ du5 d u5
371
+ duan1 d uan1
372
+ duan2 d uan2
373
+ duan3 d uan3
374
+ duan4 d uan4
375
+ duan5 d uan5
376
+ dui1 d uei1
377
+ dui2 d uei2
378
+ dui3 d uei3
379
+ dui4 d uei4
380
+ dui5 d uei5
381
+ dun1 d uen1
382
+ dun2 d uen2
383
+ dun3 d uen3
384
+ dun4 d uen4
385
+ dun5 d uen5
386
+ duo1 d uo1
387
+ duo2 d uo2
388
+ duo3 d uo3
389
+ duo4 d uo4
390
+ duo5 d uo5
391
+ e1 e1
392
+ e2 e2
393
+ e3 e3
394
+ e4 e4
395
+ e5 e5
396
+ ei1 ei1
397
+ ei2 ei2
398
+ ei3 ei3
399
+ ei4 ei4
400
+ ei5 ei5
401
+ en1 en1
402
+ en2 en2
403
+ en3 en3
404
+ en4 en4
405
+ en5 en5
406
+ eng1 eng1
407
+ eng2 eng2
408
+ eng3 eng3
409
+ eng4 eng4
410
+ eng5 eng5
411
+ r1 er1
412
+ r2 er2
413
+ r3 er3
414
+ r4 er4
415
+ r5 er5
416
+ er1 er1
417
+ er2 er2
418
+ er3 er3
419
+ er4 er4
420
+ er5 er5
421
+ fa1 f a1
422
+ fa2 f a2
423
+ fa3 f a3
424
+ fa4 f a4
425
+ fa5 f a5
426
+ fan1 f an1
427
+ fan2 f an2
428
+ fan3 f an3
429
+ fan4 f an4
430
+ fan5 f an5
431
+ fang1 f ang1
432
+ fang2 f ang2
433
+ fang3 f ang3
434
+ fang4 f ang4
435
+ fang5 f ang5
436
+ fei1 f ei1
437
+ fei2 f ei2
438
+ fei3 f ei3
439
+ fei4 f ei4
440
+ fei5 f ei5
441
+ fen1 f en1
442
+ fen2 f en2
443
+ fen3 f en3
444
+ fen4 f en4
445
+ fen5 f en5
446
+ feng1 f eng1
447
+ feng2 f eng2
448
+ feng3 f eng3
449
+ feng4 f eng4
450
+ feng5 f eng5
451
+ fo1 f o1
452
+ fo2 f o2
453
+ fo3 f o3
454
+ fo4 f o4
455
+ fo5 f o5
456
+ fou1 f ou1
457
+ fou2 f ou2
458
+ fou3 f ou3
459
+ fou4 f ou4
460
+ fou5 f ou5
461
+ fu1 f u1
462
+ fu2 f u2
463
+ fu3 f u3
464
+ fu4 f u4
465
+ fu5 f u5
466
+ ga1 g a1
467
+ ga2 g a2
468
+ ga3 g a3
469
+ ga4 g a4
470
+ ga5 g a5
471
+ gai1 g ai1
472
+ gai2 g ai2
473
+ gai3 g ai3
474
+ gai4 g ai4
475
+ gai5 g ai5
476
+ gan1 g an1
477
+ gan2 g an2
478
+ gan3 g an3
479
+ gan4 g an4
480
+ gan5 g an5
481
+ gang1 g ang1
482
+ gang2 g ang2
483
+ gang3 g ang3
484
+ gang4 g ang4
485
+ gang5 g ang5
486
+ gao1 g ao1
487
+ gao2 g ao2
488
+ gao3 g ao3
489
+ gao4 g ao4
490
+ gao5 g ao5
491
+ ge1 g e1
492
+ ge2 g e2
493
+ ge3 g e3
494
+ ge4 g e4
495
+ ge5 g e5
496
+ gei1 g ei1
497
+ gei2 g ei2
498
+ gei3 g ei3
499
+ gei4 g ei4
500
+ gei5 g ei5
501
+ gen1 g en1
502
+ gen2 g en2
503
+ gen3 g en3
504
+ gen4 g en4
505
+ gen5 g en5
506
+ geng1 g eng1
507
+ geng2 g eng2
508
+ geng3 g eng3
509
+ geng4 g eng4
510
+ geng5 g eng5
511
+ gong1 g ong1
512
+ gong2 g ong2
513
+ gong3 g ong3
514
+ gong4 g ong4
515
+ gong5 g ong5
516
+ gou1 g ou1
517
+ gou2 g ou2
518
+ gou3 g ou3
519
+ gou4 g ou4
520
+ gou5 g ou5
521
+ gu1 g u1
522
+ gu2 g u2
523
+ gu3 g u3
524
+ gu4 g u4
525
+ gu5 g u5
526
+ gua1 g ua1
527
+ gua2 g ua2
528
+ gua3 g ua3
529
+ gua4 g ua4
530
+ gua5 g ua5
531
+ guai1 g uai1
532
+ guai2 g uai2
533
+ guai3 g uai3
534
+ guai4 g uai4
535
+ guai5 g uai5
536
+ guan1 g uan1
537
+ guan2 g uan2
538
+ guan3 g uan3
539
+ guan4 g uan4
540
+ guan5 g uan5
541
+ guang1 g uang1
542
+ guang2 g uang2
543
+ guang3 g uang3
544
+ guang4 g uang4
545
+ guang5 g uang5
546
+ gui1 g uei1
547
+ gui2 g uei2
548
+ gui3 g uei3
549
+ gui4 g uei4
550
+ gui5 g uei5
551
+ gun1 g uen1
552
+ gun2 g uen2
553
+ gun3 g uen3
554
+ gun4 g uen4
555
+ gun5 g uen5
556
+ guo1 g uo1
557
+ guo2 g uo2
558
+ guo3 g uo3
559
+ guo4 g uo4
560
+ guo5 g uo5
561
+ ha1 h a1
562
+ ha2 h a2
563
+ ha3 h a3
564
+ ha4 h a4
565
+ ha5 h a5
566
+ hai1 h ai1
567
+ hai2 h ai2
568
+ hai3 h ai3
569
+ hai4 h ai4
570
+ hai5 h ai5
571
+ han1 h an1
572
+ han2 h an2
573
+ han3 h an3
574
+ han4 h an4
575
+ han5 h an5
576
+ hang1 h ang1
577
+ hang2 h ang2
578
+ hang3 h ang3
579
+ hang4 h ang4
580
+ hang5 h ang5
581
+ hao1 h ao1
582
+ hao2 h ao2
583
+ hao3 h ao3
584
+ hao4 h ao4
585
+ hao5 h ao5
586
+ he1 h e1
587
+ he2 h e2
588
+ he3 h e3
589
+ he4 h e4
590
+ he5 h e5
591
+ hei1 h ei1
592
+ hei2 h ei2
593
+ hei3 h ei3
594
+ hei4 h ei4
595
+ hei5 h ei5
596
+ hen1 h en1
597
+ hen2 h en2
598
+ hen3 h en3
599
+ hen4 h en4
600
+ hen5 h en5
601
+ heng1 h eng1
602
+ heng2 h eng2
603
+ heng3 h eng3
604
+ heng4 h eng4
605
+ heng5 h eng5
606
+ hong1 h ong1
607
+ hong2 h ong2
608
+ hong3 h ong3
609
+ hong4 h ong4
610
+ hong5 h ong5
611
+ hou1 h ou1
612
+ hou2 h ou2
613
+ hou3 h ou3
614
+ hou4 h ou4
615
+ hou5 h ou5
616
+ hu1 h u1
617
+ hu2 h u2
618
+ hu3 h u3
619
+ hu4 h u4
620
+ hu5 h u5
621
+ hua1 h ua1
622
+ hua2 h ua2
623
+ hua3 h ua3
624
+ hua4 h ua4
625
+ hua5 h ua5
626
+ huai1 h uai1
627
+ huai2 h uai2
628
+ huai3 h uai3
629
+ huai4 h uai4
630
+ huai5 h uai5
631
+ huan1 h uan1
632
+ huan2 h uan2
633
+ huan3 h uan3
634
+ huan4 h uan4
635
+ huan5 h uan5
636
+ huang1 h uang1
637
+ huang2 h uang2
638
+ huang3 h uang3
639
+ huang4 h uang4
640
+ huang5 h uang5
641
+ hui1 h uei1
642
+ hui2 h uei2
643
+ hui3 h uei3
644
+ hui4 h uei4
645
+ hui5 h uei5
646
+ hun1 h uen1
647
+ hun2 h uen2
648
+ hun3 h uen3
649
+ hun4 h uen4
650
+ hun5 h uen5
651
+ huo1 h uo1
652
+ huo2 h uo2
653
+ huo3 h uo3
654
+ huo4 h uo4
655
+ huo5 h uo5
656
+ ji1 j i1
657
+ ji2 j i2
658
+ ji3 j i3
659
+ ji4 j i4
660
+ ji5 j i5
661
+ jia1 j ia1
662
+ jia2 j ia2
663
+ jia3 j ia3
664
+ jia4 j ia4
665
+ jia5 j ia5
666
+ jian1 j ian1
667
+ jian2 j ian2
668
+ jian3 j ian3
669
+ jian4 j ian4
670
+ jian5 j ian5
671
+ jiang1 j iang1
672
+ jiang2 j iang2
673
+ jiang3 j iang3
674
+ jiang4 j iang4
675
+ jiang5 j iang5
676
+ jiao1 j iao1
677
+ jiao2 j iao2
678
+ jiao3 j iao3
679
+ jiao4 j iao4
680
+ jiao5 j iao5
681
+ jie1 j ie1
682
+ jie2 j ie2
683
+ jie3 j ie3
684
+ jie4 j ie4
685
+ jie5 j ie5
686
+ jin1 j in1
687
+ jin2 j in2
688
+ jin3 j in3
689
+ jin4 j in4
690
+ jin5 j in5
691
+ jing1 j ing1
692
+ jing2 j ing2
693
+ jing3 j ing3
694
+ jing4 j ing4
695
+ jing5 j ing5
696
+ jiong1 j iong1
697
+ jiong2 j iong2
698
+ jiong3 j iong3
699
+ jiong4 j iong4
700
+ jiong5 j iong5
701
+ jiu1 j iou1
702
+ jiu2 j iou2
703
+ jiu3 j iou3
704
+ jiu4 j iou4
705
+ jiu5 j iou5
706
+ ju1 j v1
707
+ ju2 j v2
708
+ ju3 j v3
709
+ ju4 j v4
710
+ ju5 j v5
711
+ juan1 j van1
712
+ juan2 j van2
713
+ juan3 j van3
714
+ juan4 j van4
715
+ juan5 j van5
716
+ jue1 j ve1
717
+ jue2 j ve2
718
+ jue3 j ve3
719
+ jue4 j ve4
720
+ jue5 j ve5
721
+ jun1 j vn1
722
+ jun2 j vn2
723
+ jun3 j vn3
724
+ jun4 j vn4
725
+ jun5 j vn5
726
+ ka1 k a1
727
+ ka2 k a2
728
+ ka3 k a3
729
+ ka4 k a4
730
+ ka5 k a5
731
+ kai1 k ai1
732
+ kai2 k ai2
733
+ kai3 k ai3
734
+ kai4 k ai4
735
+ kai5 k ai5
736
+ kan1 k an1
737
+ kan2 k an2
738
+ kan3 k an3
739
+ kan4 k an4
740
+ kan5 k an5
741
+ kang1 k ang1
742
+ kang2 k ang2
743
+ kang3 k ang3
744
+ kang4 k ang4
745
+ kang5 k ang5
746
+ kao1 k ao1
747
+ kao2 k ao2
748
+ kao3 k ao3
749
+ kao4 k ao4
750
+ kao5 k ao5
751
+ ke1 k e1
752
+ ke2 k e2
753
+ ke3 k e3
754
+ ke4 k e4
755
+ ke5 k e5
756
+ kei1 k ei1
757
+ kei2 k ei2
758
+ kei3 k ei3
759
+ kei4 k ei4
760
+ kei5 k ei5
761
+ ken1 k en1
762
+ ken2 k en2
763
+ ken3 k en3
764
+ ken4 k en4
765
+ ken5 k en5
766
+ keng1 k eng1
767
+ keng2 k eng2
768
+ keng3 k eng3
769
+ keng4 k eng4
770
+ keng5 k eng5
771
+ kong1 k ong1
772
+ kong2 k ong2
773
+ kong3 k ong3
774
+ kong4 k ong4
775
+ kong5 k ong5
776
+ kou1 k ou1
777
+ kou2 k ou2
778
+ kou3 k ou3
779
+ kou4 k ou4
780
+ kou5 k ou5
781
+ ku1 k u1
782
+ ku2 k u2
783
+ ku3 k u3
784
+ ku4 k u4
785
+ ku5 k u5
786
+ kua1 k ua1
787
+ kua2 k ua2
788
+ kua3 k ua3
789
+ kua4 k ua4
790
+ kua5 k ua5
791
+ kuai1 k uai1
792
+ kuai2 k uai2
793
+ kuai3 k uai3
794
+ kuai4 k uai4
795
+ kuai5 k uai5
796
+ kuan1 k uan1
797
+ kuan2 k uan2
798
+ kuan3 k uan3
799
+ kuan4 k uan4
800
+ kuan5 k uan5
801
+ kuang1 k uang1
802
+ kuang2 k uang2
803
+ kuang3 k uang3
804
+ kuang4 k uang4
805
+ kuang5 k uang5
806
+ kui1 k uei1
807
+ kui2 k uei2
808
+ kui3 k uei3
809
+ kui4 k uei4
810
+ kui5 k uei5
811
+ kun1 k uen1
812
+ kun2 k uen2
813
+ kun3 k uen3
814
+ kun4 k uen4
815
+ kun5 k uen5
816
+ kuo1 k uo1
817
+ kuo2 k uo2
818
+ kuo3 k uo3
819
+ kuo4 k uo4
820
+ kuo5 k uo5
821
+ la1 l a1
822
+ la2 l a2
823
+ la3 l a3
824
+ la4 l a4
825
+ la5 l a5
826
+ lai1 l ai1
827
+ lai2 l ai2
828
+ lai3 l ai3
829
+ lai4 l ai4
830
+ lai5 l ai5
831
+ lan1 l an1
832
+ lan2 l an2
833
+ lan3 l an3
834
+ lan4 l an4
835
+ lan5 l an5
836
+ lang1 l ang1
837
+ lang2 l ang2
838
+ lang3 l ang3
839
+ lang4 l ang4
840
+ lang5 l ang5
841
+ lao1 l ao1
842
+ lao2 l ao2
843
+ lao3 l ao3
844
+ lao4 l ao4
845
+ lao5 l ao5
846
+ le1 l e1
847
+ le2 l e2
848
+ le3 l e3
849
+ le4 l e4
850
+ le5 l e5
851
+ lei1 l ei1
852
+ lei2 l ei2
853
+ lei3 l ei3
854
+ lei4 l ei4
855
+ lei5 l ei5
856
+ leng1 l eng1
857
+ leng2 l eng2
858
+ leng3 l eng3
859
+ leng4 l eng4
860
+ leng5 l eng5
861
+ li1 l i1
862
+ li2 l i2
863
+ li3 l i3
864
+ li4 l i4
865
+ li5 l i5
866
+ lia1 l ia1
867
+ lia2 l ia2
868
+ lia3 l ia3
869
+ lia4 l ia4
870
+ lia5 l ia5
871
+ lian1 l ian1
872
+ lian2 l ian2
873
+ lian3 l ian3
874
+ lian4 l ian4
875
+ lian5 l ian5
876
+ liang1 l iang1
877
+ liang2 l iang2
878
+ liang3 l iang3
879
+ liang4 l iang4
880
+ liang5 l iang5
881
+ liao1 l iao1
882
+ liao2 l iao2
883
+ liao3 l iao3
884
+ liao4 l iao4
885
+ liao5 l iao5
886
+ lie1 l ie1
887
+ lie2 l ie2
888
+ lie3 l ie3
889
+ lie4 l ie4
890
+ lie5 l ie5
891
+ lin1 l in1
892
+ lin2 l in2
893
+ lin3 l in3
894
+ lin4 l in4
895
+ lin5 l in5
896
+ ling1 l ing1
897
+ ling2 l ing2
898
+ ling3 l ing3
899
+ ling4 l ing4
900
+ ling5 l ing5
901
+ liu1 l iou1
902
+ liu2 l iou2
903
+ liu3 l iou3
904
+ liu4 l iou4
905
+ liu5 l iou5
906
+ lo1 l o1
907
+ lo2 l o2
908
+ lo3 l o3
909
+ lo4 l o4
910
+ lo5 l o5
911
+ long1 l ong1
912
+ long2 l ong2
913
+ long3 l ong3
914
+ long4 l ong4
915
+ long5 l ong5
916
+ lou1 l ou1
917
+ lou2 l ou2
918
+ lou3 l ou3
919
+ lou4 l ou4
920
+ lou5 l ou5
921
+ lu1 l u1
922
+ lu2 l u2
923
+ lu3 l u3
924
+ lu4 l u4
925
+ lu5 l u5
926
+ luan1 l uan1
927
+ luan2 l uan2
928
+ luan3 l uan3
929
+ luan4 l uan4
930
+ luan5 l uan5
931
+ lue1 l ve1
932
+ lue2 l ve2
933
+ lue3 l ve3
934
+ lue4 l ve4
935
+ lue5 l ve5
936
+ lve1 l ve1
937
+ lve2 l ve2
938
+ lve3 l ve3
939
+ lve4 l ve4
940
+ lve5 l ve5
941
+ lun1 l uen1
942
+ lun2 l uen2
943
+ lun3 l uen3
944
+ lun4 l uen4
945
+ lun5 l uen5
946
+ luo1 l uo1
947
+ luo2 l uo2
948
+ luo3 l uo3
949
+ luo4 l uo4
950
+ luo5 l uo5
951
+ lv1 l v1
952
+ lv2 l v2
953
+ lv3 l v3
954
+ lv4 l v4
955
+ lv5 l v5
956
+ ma1 m a1
957
+ ma2 m a2
958
+ ma3 m a3
959
+ ma4 m a4
960
+ ma5 m a5
961
+ mai1 m ai1
962
+ mai2 m ai2
963
+ mai3 m ai3
964
+ mai4 m ai4
965
+ mai5 m ai5
966
+ man1 m an1
967
+ man2 m an2
968
+ man3 m an3
969
+ man4 m an4
970
+ man5 m an5
971
+ mang1 m ang1
972
+ mang2 m ang2
973
+ mang3 m ang3
974
+ mang4 m ang4
975
+ mang5 m ang5
976
+ mao1 m ao1
977
+ mao2 m ao2
978
+ mao3 m ao3
979
+ mao4 m ao4
980
+ mao5 m ao5
981
+ me1 m e1
982
+ me2 m e2
983
+ me3 m e3
984
+ me4 m e4
985
+ me5 m e5
986
+ mei1 m ei1
987
+ mei2 m ei2
988
+ mei3 m ei3
989
+ mei4 m ei4
990
+ mei5 m ei5
991
+ men1 m en1
992
+ men2 m en2
993
+ men3 m en3
994
+ men4 m en4
995
+ men5 m en5
996
+ meng1 m eng1
997
+ meng2 m eng2
998
+ meng3 m eng3
999
+ meng4 m eng4
1000
+ meng5 m eng5
1001
+ mi1 m i1
1002
+ mi2 m i2
1003
+ mi3 m i3
1004
+ mi4 m i4
1005
+ mi5 m i5
1006
+ mian1 m ian1
1007
+ mian2 m ian2
1008
+ mian3 m ian3
1009
+ mian4 m ian4
1010
+ mian5 m ian5
1011
+ miao1 m iao1
1012
+ miao2 m iao2
1013
+ miao3 m iao3
1014
+ miao4 m iao4
1015
+ miao5 m iao5
1016
+ mie1 m ie1
1017
+ mie2 m ie2
1018
+ mie3 m ie3
1019
+ mie4 m ie4
1020
+ mie5 m ie5
1021
+ min1 m in1
1022
+ min2 m in2
1023
+ min3 m in3
1024
+ min4 m in4
1025
+ min5 m in5
1026
+ ming1 m ing1
1027
+ ming2 m ing2
1028
+ ming3 m ing3
1029
+ ming4 m ing4
1030
+ ming5 m ing5
1031
+ miu1 m iou1
1032
+ miu2 m iou2
1033
+ miu3 m iou3
1034
+ miu4 m iou4
1035
+ miu5 m iou5
1036
+ mo1 m o1
1037
+ mo2 m o2
1038
+ mo3 m o3
1039
+ mo4 m o4
1040
+ mo5 m o5
1041
+ mou1 m ou1
1042
+ mou2 m ou2
1043
+ mou3 m ou3
1044
+ mou4 m ou4
1045
+ mou5 m ou5
1046
+ mu1 m u1
1047
+ mu2 m u2
1048
+ mu3 m u3
1049
+ mu4 m u4
1050
+ mu5 m u5
1051
+ na1 n a1
1052
+ na2 n a2
1053
+ na3 n a3
1054
+ na4 n a4
1055
+ na5 n a5
1056
+ nai1 n ai1
1057
+ nai2 n ai2
1058
+ nai3 n ai3
1059
+ nai4 n ai4
1060
+ nai5 n ai5
1061
+ nan1 n an1
1062
+ nan2 n an2
1063
+ nan3 n an3
1064
+ nan4 n an4
1065
+ nan5 n an5
1066
+ nang1 n ang1
1067
+ nang2 n ang2
1068
+ nang3 n ang3
1069
+ nang4 n ang4
1070
+ nang5 n ang5
1071
+ nao1 n ao1
1072
+ nao2 n ao2
1073
+ nao3 n ao3
1074
+ nao4 n ao4
1075
+ nao5 n ao5
1076
+ ne1 n e1
1077
+ ne2 n e2
1078
+ ne3 n e3
1079
+ ne4 n e4
1080
+ ne5 n e5
1081
+ nei1 n ei1
1082
+ nei2 n ei2
1083
+ nei3 n ei3
1084
+ nei4 n ei4
1085
+ nei5 n ei5
1086
+ nen1 n en1
1087
+ nen2 n en2
1088
+ nen3 n en3
1089
+ nen4 n en4
1090
+ nen5 n en5
1091
+ neng1 n eng1
1092
+ neng2 n eng2
1093
+ neng3 n eng3
1094
+ neng4 n eng4
1095
+ neng5 n eng5
1096
+ ni1 n i1
1097
+ ni2 n i2
1098
+ ni3 n i3
1099
+ ni4 n i4
1100
+ ni5 n i5
1101
+ nian1 n ian1
1102
+ nian2 n ian2
1103
+ nian3 n ian3
1104
+ nian4 n ian4
1105
+ nian5 n ian5
1106
+ niang1 n iang1
1107
+ niang2 n iang2
1108
+ niang3 n iang3
1109
+ niang4 n iang4
1110
+ niang5 n iang5
1111
+ niao1 n iao1
1112
+ niao2 n iao2
1113
+ niao3 n iao3
1114
+ niao4 n iao4
1115
+ niao5 n iao5
1116
+ nie1 n ie1
1117
+ nie2 n ie2
1118
+ nie3 n ie3
1119
+ nie4 n ie4
1120
+ nie5 n ie5
1121
+ nin1 n in1
1122
+ nin2 n in2
1123
+ nin3 n in3
1124
+ nin4 n in4
1125
+ nin5 n in5
1126
+ ning1 n ing1
1127
+ ning2 n ing2
1128
+ ning3 n ing3
1129
+ ning4 n ing4
1130
+ ning5 n ing5
1131
+ niu1 n iou1
1132
+ niu2 n iou2
1133
+ niu3 n iou3
1134
+ niu4 n iou4
1135
+ niu5 n iou5
1136
+ nong1 n ong1
1137
+ nong2 n ong2
1138
+ nong3 n ong3
1139
+ nong4 n ong4
1140
+ nong5 n ong5
1141
+ nou1 n ou1
1142
+ nou2 n ou2
1143
+ nou3 n ou3
1144
+ nou4 n ou4
1145
+ nou5 n ou5
1146
+ nu1 n u1
1147
+ nu2 n u2
1148
+ nu3 n u3
1149
+ nu4 n u4
1150
+ nu5 n u5
1151
+ nuan1 n uan1
1152
+ nuan2 n uan2
1153
+ nuan3 n uan3
1154
+ nuan4 n uan4
1155
+ nuan5 n uan5
1156
+ nue1 n ve1
1157
+ nue2 n ve2
1158
+ nue3 n ve3
1159
+ nue4 n ve4
1160
+ nue5 n ve5
1161
+ nve1 n ve1
1162
+ nve2 n ve2
1163
+ nve3 n ve3
1164
+ nve4 n ve4
1165
+ nve5 n ve5
1166
+ nuo1 n uo1
1167
+ nuo2 n uo2
1168
+ nuo3 n uo3
1169
+ nuo4 n uo4
1170
+ nuo5 n uo5
1171
+ nv1 n v1
1172
+ nv2 n v2
1173
+ nv3 n v3
1174
+ nv4 n v4
1175
+ nv5 n v5
1176
+ o1 o1
1177
+ o2 o2
1178
+ o3 o3
1179
+ o4 o4
1180
+ o5 o5
1181
+ ou1 ou1
1182
+ ou2 ou2
1183
+ ou3 ou3
1184
+ ou4 ou4
1185
+ ou5 ou5
1186
+ pa1 p a1
1187
+ pa2 p a2
1188
+ pa3 p a3
1189
+ pa4 p a4
1190
+ pa5 p a5
1191
+ pai1 p ai1
1192
+ pai2 p ai2
1193
+ pai3 p ai3
1194
+ pai4 p ai4
1195
+ pai5 p ai5
1196
+ pan1 p an1
1197
+ pan2 p an2
1198
+ pan3 p an3
1199
+ pan4 p an4
1200
+ pan5 p an5
1201
+ pang1 p ang1
1202
+ pang2 p ang2
1203
+ pang3 p ang3
1204
+ pang4 p ang4
1205
+ pang5 p ang5
1206
+ pao1 p ao1
1207
+ pao2 p ao2
1208
+ pao3 p ao3
1209
+ pao4 p ao4
1210
+ pao5 p ao5
1211
+ pei1 p ei1
1212
+ pei2 p ei2
1213
+ pei3 p ei3
1214
+ pei4 p ei4
1215
+ pei5 p ei5
1216
+ pen1 p en1
1217
+ pen2 p en2
1218
+ pen3 p en3
1219
+ pen4 p en4
1220
+ pen5 p en5
1221
+ peng1 p eng1
1222
+ peng2 p eng2
1223
+ peng3 p eng3
1224
+ peng4 p eng4
1225
+ peng5 p eng5
1226
+ pi1 p i1
1227
+ pi2 p i2
1228
+ pi3 p i3
1229
+ pi4 p i4
1230
+ pi5 p i5
1231
+ pian1 p ian1
1232
+ pian2 p ian2
1233
+ pian3 p ian3
1234
+ pian4 p ian4
1235
+ pian5 p ian5
1236
+ piao1 p iao1
1237
+ piao2 p iao2
1238
+ piao3 p iao3
1239
+ piao4 p iao4
1240
+ piao5 p iao5
1241
+ pie1 p ie1
1242
+ pie2 p ie2
1243
+ pie3 p ie3
1244
+ pie4 p ie4
1245
+ pie5 p ie5
1246
+ pin1 p in1
1247
+ pin2 p in2
1248
+ pin3 p in3
1249
+ pin4 p in4
1250
+ pin5 p in5
1251
+ ping1 p ing1
1252
+ ping2 p ing2
1253
+ ping3 p ing3
1254
+ ping4 p ing4
1255
+ ping5 p ing5
1256
+ po1 p o1
1257
+ po2 p o2
1258
+ po3 p o3
1259
+ po4 p o4
1260
+ po5 p o5
1261
+ pou1 p ou1
1262
+ pou2 p ou2
1263
+ pou3 p ou3
1264
+ pou4 p ou4
1265
+ pou5 p ou5
1266
+ pu1 p u1
1267
+ pu2 p u2
1268
+ pu3 p u3
1269
+ pu4 p u4
1270
+ pu5 p u5
1271
+ qi1 q i1
1272
+ qi2 q i2
1273
+ qi3 q i3
1274
+ qi4 q i4
1275
+ qi5 q i5
1276
+ qia1 q ia1
1277
+ qia2 q ia2
1278
+ qia3 q ia3
1279
+ qia4 q ia4
1280
+ qia5 q ia5
1281
+ qian1 q ian1
1282
+ qian2 q ian2
1283
+ qian3 q ian3
1284
+ qian4 q ian4
1285
+ qian5 q ian5
1286
+ qiang1 q iang1
1287
+ qiang2 q iang2
1288
+ qiang3 q iang3
1289
+ qiang4 q iang4
1290
+ qiang5 q iang5
1291
+ qiao1 q iao1
1292
+ qiao2 q iao2
1293
+ qiao3 q iao3
1294
+ qiao4 q iao4
1295
+ qiao5 q iao5
1296
+ qie1 q ie1
1297
+ qie2 q ie2
1298
+ qie3 q ie3
1299
+ qie4 q ie4
1300
+ qie5 q ie5
1301
+ qin1 q in1
1302
+ qin2 q in2
1303
+ qin3 q in3
1304
+ qin4 q in4
1305
+ qin5 q in5
1306
+ qing1 q ing1
1307
+ qing2 q ing2
1308
+ qing3 q ing3
1309
+ qing4 q ing4
1310
+ qing5 q ing5
1311
+ qiong1 q iong1
1312
+ qiong2 q iong2
1313
+ qiong3 q iong3
1314
+ qiong4 q iong4
1315
+ qiong5 q iong5
1316
+ qiu1 q iou1
1317
+ qiu2 q iou2
1318
+ qiu3 q iou3
1319
+ qiu4 q iou4
1320
+ qiu5 q iou5
1321
+ qu1 q v1
1322
+ qu2 q v2
1323
+ qu3 q v3
1324
+ qu4 q v4
1325
+ qu5 q v5
1326
+ quan1 q van1
1327
+ quan2 q van2
1328
+ quan3 q van3
1329
+ quan4 q van4
1330
+ quan5 q van5
1331
+ que1 q ve1
1332
+ que2 q ve2
1333
+ que3 q ve3
1334
+ que4 q ve4
1335
+ que5 q ve5
1336
+ qun1 q vn1
1337
+ qun2 q vn2
1338
+ qun3 q vn3
1339
+ qun4 q vn4
1340
+ qun5 q vn5
1341
+ ran1 r an1
1342
+ ran2 r an2
1343
+ ran3 r an3
1344
+ ran4 r an4
1345
+ ran5 r an5
1346
+ rang1 r ang1
1347
+ rang2 r ang2
1348
+ rang3 r ang3
1349
+ rang4 r ang4
1350
+ rang5 r ang5
1351
+ rao1 r ao1
1352
+ rao2 r ao2
1353
+ rao3 r ao3
1354
+ rao4 r ao4
1355
+ rao5 r ao5
1356
+ re1 r e1
1357
+ re2 r e2
1358
+ re3 r e3
1359
+ re4 r e4
1360
+ re5 r e5
1361
+ ren1 r en1
1362
+ ren2 r en2
1363
+ ren3 r en3
1364
+ ren4 r en4
1365
+ ren5 r en5
1366
+ reng1 r eng1
1367
+ reng2 r eng2
1368
+ reng3 r eng3
1369
+ reng4 r eng4
1370
+ reng5 r eng5
1371
+ ri1 r iii1
1372
+ ri2 r iii2
1373
+ ri3 r iii3
1374
+ ri4 r iii4
1375
+ ri5 r iii5
1376
+ rong1 r ong1
1377
+ rong2 r ong2
1378
+ rong3 r ong3
1379
+ rong4 r ong4
1380
+ rong5 r ong5
1381
+ rou1 r ou1
1382
+ rou2 r ou2
1383
+ rou3 r ou3
1384
+ rou4 r ou4
1385
+ rou5 r ou5
1386
+ ru1 r u1
1387
+ ru2 r u2
1388
+ ru3 r u3
1389
+ ru4 r u4
1390
+ ru5 r u5
1391
+ rua1 r ua1
1392
+ rua2 r ua2
1393
+ rua3 r ua3
1394
+ rua4 r ua4
1395
+ rua5 r ua5
1396
+ ruan1 r uan1
1397
+ ruan2 r uan2
1398
+ ruan3 r uan3
1399
+ ruan4 r uan4
1400
+ ruan5 r uan5
1401
+ rui1 r uei1
1402
+ rui2 r uei2
1403
+ rui3 r uei3
1404
+ rui4 r uei4
1405
+ rui5 r uei5
1406
+ run1 r uen1
1407
+ run2 r uen2
1408
+ run3 r uen3
1409
+ run4 r uen4
1410
+ run5 r uen5
1411
+ ruo1 r uo1
1412
+ ruo2 r uo2
1413
+ ruo3 r uo3
1414
+ ruo4 r uo4
1415
+ ruo5 r uo5
1416
+ sa1 s a1
1417
+ sa2 s a2
1418
+ sa3 s a3
1419
+ sa4 s a4
1420
+ sa5 s a5
1421
+ sai1 s ai1
1422
+ sai2 s ai2
1423
+ sai3 s ai3
1424
+ sai4 s ai4
1425
+ sai5 s ai5
1426
+ san1 s an1
1427
+ san2 s an2
1428
+ san3 s an3
1429
+ san4 s an4
1430
+ san5 s an5
1431
+ sang1 s ang1
1432
+ sang2 s ang2
1433
+ sang3 s ang3
1434
+ sang4 s ang4
1435
+ sang5 s ang5
1436
+ sao1 s ao1
1437
+ sao2 s ao2
1438
+ sao3 s ao3
1439
+ sao4 s ao4
1440
+ sao5 s ao5
1441
+ se1 s e1
1442
+ se2 s e2
1443
+ se3 s e3
1444
+ se4 s e4
1445
+ se5 s e5
1446
+ sen1 s en1
1447
+ sen2 s en2
1448
+ sen3 s en3
1449
+ sen4 s en4
1450
+ sen5 s en5
1451
+ seng1 s eng1
1452
+ seng2 s eng2
1453
+ seng3 s eng3
1454
+ seng4 s eng4
1455
+ seng5 s eng5
1456
+ sha1 sh a1
1457
+ sha2 sh a2
1458
+ sha3 sh a3
1459
+ sha4 sh a4
1460
+ sha5 sh a5
1461
+ shai1 sh ai1
1462
+ shai2 sh ai2
1463
+ shai3 sh ai3
1464
+ shai4 sh ai4
1465
+ shai5 sh ai5
1466
+ shan1 sh an1
1467
+ shan2 sh an2
1468
+ shan3 sh an3
1469
+ shan4 sh an4
1470
+ shan5 sh an5
1471
+ shang1 sh ang1
1472
+ shang2 sh ang2
1473
+ shang3 sh ang3
1474
+ shang4 sh ang4
1475
+ shang5 sh ang5
1476
+ shao1 sh ao1
1477
+ shao2 sh ao2
1478
+ shao3 sh ao3
1479
+ shao4 sh ao4
1480
+ shao5 sh ao5
1481
+ she1 sh e1
1482
+ she2 sh e2
1483
+ she3 sh e3
1484
+ she4 sh e4
1485
+ she5 sh e5
1486
+ shei1 sh ei1
1487
+ shei2 sh ei2
1488
+ shei3 sh ei3
1489
+ shei4 sh ei4
1490
+ shei5 sh ei5
1491
+ shen1 sh en1
1492
+ shen2 sh en2
1493
+ shen3 sh en3
1494
+ shen4 sh en4
1495
+ shen5 sh en5
1496
+ sheng1 sh eng1
1497
+ sheng2 sh eng2
1498
+ sheng3 sh eng3
1499
+ sheng4 sh eng4
1500
+ sheng5 sh eng5
1501
+ shi1 sh iii1
1502
+ shi2 sh iii2
1503
+ shi3 sh iii3
1504
+ shi4 sh iii4
1505
+ shi5 sh iii5
1506
+ shou1 sh ou1
1507
+ shou2 sh ou2
1508
+ shou3 sh ou3
1509
+ shou4 sh ou4
1510
+ shou5 sh ou5
1511
+ shu1 sh u1
1512
+ shu2 sh u2
1513
+ shu3 sh u3
1514
+ shu4 sh u4
1515
+ shu5 sh u5
1516
+ shua1 sh ua1
1517
+ shua2 sh ua2
1518
+ shua3 sh ua3
1519
+ shua4 sh ua4
1520
+ shua5 sh ua5
1521
+ shuai1 sh uai1
1522
+ shuai2 sh uai2
1523
+ shuai3 sh uai3
1524
+ shuai4 sh uai4
1525
+ shuai5 sh uai5
1526
+ shuan1 sh uan1
1527
+ shuan2 sh uan2
1528
+ shuan3 sh uan3
1529
+ shuan4 sh uan4
1530
+ shuan5 sh uan5
1531
+ shuang1 sh uang1
1532
+ shuang2 sh uang2
1533
+ shuang3 sh uang3
1534
+ shuang4 sh uang4
1535
+ shuang5 sh uang5
1536
+ shui1 sh uei1
1537
+ shui2 sh uei2
1538
+ shui3 sh uei3
1539
+ shui4 sh uei4
1540
+ shui5 sh uei5
1541
+ shun1 sh uen1
1542
+ shun2 sh uen2
1543
+ shun3 sh uen3
1544
+ shun4 sh uen4
1545
+ shun5 sh uen5
1546
+ shuo1 sh uo1
1547
+ shuo2 sh uo2
1548
+ shuo3 sh uo3
1549
+ shuo4 sh uo4
1550
+ shuo5 sh uo5
1551
+ si1 s ii1
1552
+ si2 s ii2
1553
+ si3 s ii3
1554
+ si4 s ii4
1555
+ si5 s ii5
1556
+ song1 s ong1
1557
+ song2 s ong2
1558
+ song3 s ong3
1559
+ song4 s ong4
1560
+ song5 s ong5
1561
+ sou1 s ou1
1562
+ sou2 s ou2
1563
+ sou3 s ou3
1564
+ sou4 s ou4
1565
+ sou5 s ou5
1566
+ su1 s u1
1567
+ su2 s u2
1568
+ su3 s u3
1569
+ su4 s u4
1570
+ su5 s u5
1571
+ suan1 s uan1
1572
+ suan2 s uan2
1573
+ suan3 s uan3
1574
+ suan4 s uan4
1575
+ suan5 s uan5
1576
+ sui1 s uei1
1577
+ sui2 s uei2
1578
+ sui3 s uei3
1579
+ sui4 s uei4
1580
+ sui5 s uei5
1581
+ sun1 s uen1
1582
+ sun2 s uen2
1583
+ sun3 s uen3
1584
+ sun4 s uen4
1585
+ sun5 s uen5
1586
+ suo1 s uo1
1587
+ suo2 s uo2
1588
+ suo3 s uo3
1589
+ suo4 s uo4
1590
+ suo5 s uo5
1591
+ ta1 t a1
1592
+ ta2 t a2
1593
+ ta3 t a3
1594
+ ta4 t a4
1595
+ ta5 t a5
1596
+ tai1 t ai1
1597
+ tai2 t ai2
1598
+ tai3 t ai3
1599
+ tai4 t ai4
1600
+ tai5 t ai5
1601
+ tan1 t an1
1602
+ tan2 t an2
1603
+ tan3 t an3
1604
+ tan4 t an4
1605
+ tan5 t an5
1606
+ tang1 t ang1
1607
+ tang2 t ang2
1608
+ tang3 t ang3
1609
+ tang4 t ang4
1610
+ tang5 t ang5
1611
+ tao1 t ao1
1612
+ tao2 t ao2
1613
+ tao3 t ao3
1614
+ tao4 t ao4
1615
+ tao5 t ao5
1616
+ te1 t e1
1617
+ te2 t e2
1618
+ te3 t e3
1619
+ te4 t e4
1620
+ te5 t e5
1621
+ tei1 t ei1
1622
+ tei2 t ei2
1623
+ tei3 t ei3
1624
+ tei4 t ei4
1625
+ tei5 t ei5
1626
+ teng1 t eng1
1627
+ teng2 t eng2
1628
+ teng3 t eng3
1629
+ teng4 t eng4
1630
+ teng5 t eng5
1631
+ ti1 t i1
1632
+ ti2 t i2
1633
+ ti3 t i3
1634
+ ti4 t i4
1635
+ ti5 t i5
1636
+ tian1 t ian1
1637
+ tian2 t ian2
1638
+ tian3 t ian3
1639
+ tian4 t ian4
1640
+ tian5 t ian5
1641
+ tiao1 t iao1
1642
+ tiao2 t iao2
1643
+ tiao3 t iao3
1644
+ tiao4 t iao4
1645
+ tiao5 t iao5
1646
+ tie1 t ie1
1647
+ tie2 t ie2
1648
+ tie3 t ie3
1649
+ tie4 t ie4
1650
+ tie5 t ie5
1651
+ ting1 t ing1
1652
+ ting2 t ing2
1653
+ ting3 t ing3
1654
+ ting4 t ing4
1655
+ ting5 t ing5
1656
+ tong1 t ong1
1657
+ tong2 t ong2
1658
+ tong3 t ong3
1659
+ tong4 t ong4
1660
+ tong5 t ong5
1661
+ tou1 t ou1
1662
+ tou2 t ou2
1663
+ tou3 t ou3
1664
+ tou4 t ou4
1665
+ tou5 t ou5
1666
+ tu1 t u1
1667
+ tu2 t u2
1668
+ tu3 t u3
1669
+ tu4 t u4
1670
+ tu5 t u5
1671
+ tuan1 t uan1
1672
+ tuan2 t uan2
1673
+ tuan3 t uan3
1674
+ tuan4 t uan4
1675
+ tuan5 t uan5
1676
+ tui1 t uei1
1677
+ tui2 t uei2
1678
+ tui3 t uei3
1679
+ tui4 t uei4
1680
+ tui5 t uei5
1681
+ tun1 t uen1
1682
+ tun2 t uen2
1683
+ tun3 t uen3
1684
+ tun4 t uen4
1685
+ tun5 t uen5
1686
+ tuo1 t uo1
1687
+ tuo2 t uo2
1688
+ tuo3 t uo3
1689
+ tuo4 t uo4
1690
+ tuo5 t uo5
1691
+ wa1 w ua1
1692
+ wa2 w ua2
1693
+ wa3 w ua3
1694
+ wa4 w ua4
1695
+ wa5 w ua5
1696
+ wai1 w uai1
1697
+ wai2 w uai2
1698
+ wai3 w uai3
1699
+ wai4 w uai4
1700
+ wai5 w uai5
1701
+ wan1 w uan1
1702
+ wan2 w uan2
1703
+ wan3 w uan3
1704
+ wan4 w uan4
1705
+ wan5 w uan5
1706
+ wang1 w uang1
1707
+ wang2 w uang2
1708
+ wang3 w uang3
1709
+ wang4 w uang4
1710
+ wang5 w uang5
1711
+ wei1 w uei1
1712
+ wei2 w uei2
1713
+ wei3 w uei3
1714
+ wei4 w uei4
1715
+ wei5 w uei5
1716
+ wen1 w uen1
1717
+ wen2 w uen2
1718
+ wen3 w uen3
1719
+ wen4 w uen4
1720
+ wen5 w uen5
1721
+ weng1 w uen1
1722
+ weng2 w uen2
1723
+ weng3 w uen3
1724
+ weng4 w uen4
1725
+ weng5 w uen5
1726
+ wo1 w uo1
1727
+ wo2 w uo2
1728
+ wo3 w uo3
1729
+ wo4 w uo4
1730
+ wo5 w uo5
1731
+ wu1 w u1
1732
+ wu2 w u2
1733
+ wu3 w u3
1734
+ wu4 w u4
1735
+ wu5 w u5
1736
+ xi1 x i1
1737
+ xi2 x i2
1738
+ xi3 x i3
1739
+ xi4 x i4
1740
+ xi5 x i5
1741
+ xia1 x ia1
1742
+ xia2 x ia2
1743
+ xia3 x ia3
1744
+ xia4 x ia4
1745
+ xia5 x ia5
1746
+ xian1 x ian1
1747
+ xian2 x ian2
1748
+ xian3 x ian3
1749
+ xian4 x ian4
1750
+ xian5 x ian5
1751
+ xiang1 x iang1
1752
+ xiang2 x iang2
1753
+ xiang3 x iang3
1754
+ xiang4 x iang4
1755
+ xiang5 x iang5
1756
+ xiao1 x iao1
1757
+ xiao2 x iao2
1758
+ xiao3 x iao3
1759
+ xiao4 x iao4
1760
+ xiao5 x iao5
1761
+ xie1 x ie1
1762
+ xie2 x ie2
1763
+ xie3 x ie3
1764
+ xie4 x ie4
1765
+ xie5 x ie5
1766
+ xin1 x in1
1767
+ xin2 x in2
1768
+ xin3 x in3
1769
+ xin4 x in4
1770
+ xin5 x in5
1771
+ xing1 x ing1
1772
+ xing2 x ing2
1773
+ xing3 x ing3
1774
+ xing4 x ing4
1775
+ xing5 x ing5
1776
+ xiong1 x iong1
1777
+ xiong2 x iong2
1778
+ xiong3 x iong3
1779
+ xiong4 x iong4
1780
+ xiong5 x iong5
1781
+ xiu1 x iou1
1782
+ xiu2 x iou2
1783
+ xiu3 x iou3
1784
+ xiu4 x iou4
1785
+ xiu5 x iou5
1786
+ xu1 x v1
1787
+ xu2 x v2
1788
+ xu3 x v3
1789
+ xu4 x v4
1790
+ xu5 x v5
1791
+ xuan1 x van1
1792
+ xuan2 x van2
1793
+ xuan3 x van3
1794
+ xuan4 x van4
1795
+ xuan5 x van5
1796
+ xue1 x ve1
1797
+ xue2 x ve2
1798
+ xue3 x ve3
1799
+ xue4 x ve4
1800
+ xue5 x ve5
1801
+ xun1 x vn1
1802
+ xun2 x vn2
1803
+ xun3 x vn3
1804
+ xun4 x vn4
1805
+ xun5 x vn5
1806
+ ya1 y ia1
1807
+ ya2 y ia2
1808
+ ya3 y ia3
1809
+ ya4 y ia4
1810
+ ya5 y ia5
1811
+ yan1 y ian1
1812
+ yan2 y ian2
1813
+ yan3 y ian3
1814
+ yan4 y ian4
1815
+ yan5 y ian5
1816
+ yang1 y iang1
1817
+ yang2 y iang2
1818
+ yang3 y iang3
1819
+ yang4 y iang4
1820
+ yang5 y iang5
1821
+ yao1 y iao1
1822
+ yao2 y iao2
1823
+ yao3 y iao3
1824
+ yao4 y iao4
1825
+ yao5 y iao5
1826
+ ye1 y ie1
1827
+ ye2 y ie2
1828
+ ye3 y ie3
1829
+ ye4 y ie4
1830
+ ye5 y ie5
1831
+ yi1 y i1
1832
+ yi2 y i2
1833
+ yi3 y i3
1834
+ yi4 y i4
1835
+ yi5 y i5
1836
+ yin1 y in1
1837
+ yin2 y in2
1838
+ yin3 y in3
1839
+ yin4 y in4
1840
+ yin5 y in5
1841
+ ying1 y ing1
1842
+ ying2 y ing2
1843
+ ying3 y ing3
1844
+ ying4 y ing4
1845
+ ying5 y ing5
1846
+ yo1 y iou1
1847
+ yo2 y iou2
1848
+ yo3 y iou3
1849
+ yo4 y iou4
1850
+ yo5 y iou5
1851
+ yong1 y iong1
1852
+ yong2 y iong2
1853
+ yong3 y iong3
1854
+ yong4 y iong4
1855
+ yong5 y iong5
1856
+ you1 y iou1
1857
+ you2 y iou2
1858
+ you3 y iou3
1859
+ you4 y iou4
1860
+ you5 y iou5
1861
+ yu1 y v1
1862
+ yu2 y v2
1863
+ yu3 y v3
1864
+ yu4 y v4
1865
+ yu5 y v5
1866
+ yuan1 y van1
1867
+ yuan2 y van2
1868
+ yuan3 y van3
1869
+ yuan4 y van4
1870
+ yuan5 y van5
1871
+ yue1 y ve1
1872
+ yue2 y ve2
1873
+ yue3 y ve3
1874
+ yue4 y ve4
1875
+ yue5 y ve5
1876
+ yun1 y vn1
1877
+ yun2 y vn2
1878
+ yun3 y vn3
1879
+ yun4 y vn4
1880
+ yun5 y vn5
1881
+ za1 z a1
1882
+ za2 z a2
1883
+ za3 z a3
1884
+ za4 z a4
1885
+ za5 z a5
1886
+ zai1 z ai1
1887
+ zai2 z ai2
1888
+ zai3 z ai3
1889
+ zai4 z ai4
1890
+ zai5 z ai5
1891
+ zan1 z an1
1892
+ zan2 z an2
1893
+ zan3 z an3
1894
+ zan4 z an4
1895
+ zan5 z an5
1896
+ zang1 z ang1
1897
+ zang2 z ang2
1898
+ zang3 z ang3
1899
+ zang4 z ang4
1900
+ zang5 z ang5
1901
+ zao1 z ao1
1902
+ zao2 z ao2
1903
+ zao3 z ao3
1904
+ zao4 z ao4
1905
+ zao5 z ao5
1906
+ ze1 z e1
1907
+ ze2 z e2
1908
+ ze3 z e3
1909
+ ze4 z e4
1910
+ ze5 z e5
1911
+ zei1 z ei1
1912
+ zei2 z ei2
1913
+ zei3 z ei3
1914
+ zei4 z ei4
1915
+ zei5 z ei5
1916
+ zen1 z en1
1917
+ zen2 z en2
1918
+ zen3 z en3
1919
+ zen4 z en4
1920
+ zen5 z en5
1921
+ zeng1 z eng1
1922
+ zeng2 z eng2
1923
+ zeng3 z eng3
1924
+ zeng4 z eng4
1925
+ zeng5 z eng5
1926
+ zha1 zh a1
1927
+ zha2 zh a2
1928
+ zha3 zh a3
1929
+ zha4 zh a4
1930
+ zha5 zh a5
1931
+ zhai1 zh ai1
1932
+ zhai2 zh ai2
1933
+ zhai3 zh ai3
1934
+ zhai4 zh ai4
1935
+ zhai5 zh ai5
1936
+ zhan1 zh an1
1937
+ zhan2 zh an2
1938
+ zhan3 zh an3
1939
+ zhan4 zh an4
1940
+ zhan5 zh an5
1941
+ zhang1 zh ang1
1942
+ zhang2 zh ang2
1943
+ zhang3 zh ang3
1944
+ zhang4 zh ang4
1945
+ zhang5 zh ang5
1946
+ zhao1 zh ao1
1947
+ zhao2 zh ao2
1948
+ zhao3 zh ao3
1949
+ zhao4 zh ao4
1950
+ zhao5 zh ao5
1951
+ zhe1 zh e1
1952
+ zhe2 zh e2
1953
+ zhe3 zh e3
1954
+ zhe4 zh e4
1955
+ zhe5 zh e5
1956
+ zhei1 zh ei1
1957
+ zhei2 zh ei2
1958
+ zhei3 zh ei3
1959
+ zhei4 zh ei4
1960
+ zhei5 zh ei5
1961
+ zhen1 zh en1
1962
+ zhen2 zh en2
1963
+ zhen3 zh en3
1964
+ zhen4 zh en4
1965
+ zhen5 zh en5
1966
+ zheng1 zh eng1
1967
+ zheng2 zh eng2
1968
+ zheng3 zh eng3
1969
+ zheng4 zh eng4
1970
+ zheng5 zh eng5
1971
+ zhi1 zh iii1
1972
+ zhi2 zh iii2
1973
+ zhi3 zh iii3
1974
+ zhi4 zh iii4
1975
+ zhi5 zh iii5
1976
+ zhong1 zh ong1
1977
+ zhong2 zh ong2
1978
+ zhong3 zh ong3
1979
+ zhong4 zh ong4
1980
+ zhong5 zh ong5
1981
+ zhou1 zh ou1
1982
+ zhou2 zh ou2
1983
+ zhou3 zh ou3
1984
+ zhou4 zh ou4
1985
+ zhou5 zh ou5
1986
+ zhu1 zh u1
1987
+ zhu2 zh u2
1988
+ zhu3 zh u3
1989
+ zhu4 zh u4
1990
+ zhu5 zh u5
1991
+ zhua1 zh ua1
1992
+ zhua2 zh ua2
1993
+ zhua3 zh ua3
1994
+ zhua4 zh ua4
1995
+ zhua5 zh ua5
1996
+ zhuai1 zh uai1
1997
+ zhuai2 zh uai2
1998
+ zhuai3 zh uai3
1999
+ zhuai4 zh uai4
2000
+ zhuai5 zh uai5
2001
+ zhuan1 zh uan1
2002
+ zhuan2 zh uan2
2003
+ zhuan3 zh uan3
2004
+ zhuan4 zh uan4
2005
+ zhuan5 zh uan5
2006
+ zhuang1 zh uang1
2007
+ zhuang2 zh uang2
2008
+ zhuang3 zh uang3
2009
+ zhuang4 zh uang4
2010
+ zhuang5 zh uang5
2011
+ zhui1 zh uei1
2012
+ zhui2 zh uei2
2013
+ zhui3 zh uei3
2014
+ zhui4 zh uei4
2015
+ zhui5 zh uei5
2016
+ zhun1 zh uen1
2017
+ zhun2 zh uen2
2018
+ zhun3 zh uen3
2019
+ zhun4 zh uen4
2020
+ zhun5 zh uen5
2021
+ zhuo1 zh uo1
2022
+ zhuo2 zh uo2
2023
+ zhuo3 zh uo3
2024
+ zhuo4 zh uo4
2025
+ zhuo5 zh uo5
2026
+ zi1 z ii1
2027
+ zi2 z ii2
2028
+ zi3 z ii3
2029
+ zi4 z ii4
2030
+ zi5 z ii5
2031
+ zong1 z ong1
2032
+ zong2 z ong2
2033
+ zong3 z ong3
2034
+ zong4 z ong4
2035
+ zong5 z ong5
2036
+ zou1 z ou1
2037
+ zou2 z ou2
2038
+ zou3 z ou3
2039
+ zou4 z ou4
2040
+ zou5 z ou5
2041
+ zu1 z u1
2042
+ zu2 z u2
2043
+ zu3 z u3
2044
+ zu4 z u4
2045
+ zu5 z u5
2046
+ zuan1 z uan1
2047
+ zuan2 z uan2
2048
+ zuan3 z uan3
2049
+ zuan4 z uan4
2050
+ zuan5 z uan5
2051
+ zui1 z uei1
2052
+ zui2 z uei2
2053
+ zui3 z uei3
2054
+ zui4 z uei4
2055
+ zui5 z uei5
2056
+ zun1 z uen1
2057
+ zun2 z uen2
2058
+ zun3 z uen3
2059
+ zun4 z uen4
2060
+ zun5 z uen5
2061
+ zuo1 z uo1
2062
+ zuo2 z uo2
2063
+ zuo3 z uo3
2064
+ zuo4 z uo4
2065
+ zuo5 z uo5
2066
+ ar1 a1 rr
2067
+ ar2 a2 rr
2068
+ ar3 a3 rr
2069
+ ar4 a4 rr
2070
+ ar5 a5 rr
2071
+ air1 ai1 rr
2072
+ air2 ai2 rr
2073
+ air3 ai3 rr
2074
+ air4 ai4 rr
2075
+ air5 ai5 rr
2076
+ anr1 an1 rr
2077
+ anr2 an2 rr
2078
+ anr3 an3 rr
2079
+ anr4 an4 rr
2080
+ anr5 an5 rr
2081
+ angr1 ang1 rr
2082
+ angr2 ang2 rr
2083
+ angr3 ang3 rr
2084
+ angr4 ang4 rr
2085
+ angr5 ang5 rr
2086
+ aor1 ao1 rr
2087
+ aor2 ao2 rr
2088
+ aor3 ao3 rr
2089
+ aor4 ao4 rr
2090
+ aor5 ao5 rr
2091
+ bar1 b a1 rr
2092
+ bar2 b a2 rr
2093
+ bar3 b a3 rr
2094
+ bar4 b a4 rr
2095
+ bar5 b a5 rr
2096
+ bair1 b ai1 rr
2097
+ bair2 b ai2 rr
2098
+ bair3 b ai3 rr
2099
+ bair4 b ai4 rr
2100
+ bair5 b ai5 rr
2101
+ banr1 b an1 rr
2102
+ banr2 b an2 rr
2103
+ banr3 b an3 rr
2104
+ banr4 b an4 rr
2105
+ banr5 b an5 rr
2106
+ bangr1 b ang1 rr
2107
+ bangr2 b ang2 rr
2108
+ bangr3 b ang3 rr
2109
+ bangr4 b ang4 rr
2110
+ bangr5 b ang5 rr
2111
+ baor1 b ao1 rr
2112
+ baor2 b ao2 rr
2113
+ baor3 b ao3 rr
2114
+ baor4 b ao4 rr
2115
+ baor5 b ao5 rr
2116
+ beir1 b ei1 rr
2117
+ beir2 b ei2 rr
2118
+ beir3 b ei3 rr
2119
+ beir4 b ei4 rr
2120
+ beir5 b ei5 rr
2121
+ benr1 b en1 rr
2122
+ benr2 b en2 rr
2123
+ benr3 b en3 rr
2124
+ benr4 b en4 rr
2125
+ benr5 b en5 rr
2126
+ bengr1 b eng1 rr
2127
+ bengr2 b eng2 rr
2128
+ bengr3 b eng3 rr
2129
+ bengr4 b eng4 rr
2130
+ bengr5 b eng5 rr
2131
+ bir1 b i1 rr
2132
+ bir2 b i2 rr
2133
+ bir3 b i3 rr
2134
+ bir4 b i4 rr
2135
+ bir5 b i5 rr
2136
+ bianr1 b ian1 rr
2137
+ bianr2 b ian2 rr
2138
+ bianr3 b ian3 rr
2139
+ bianr4 b ian4 rr
2140
+ bianr5 b ian5 rr
2141
+ biaor1 b iao1 rr
2142
+ biaor2 b iao2 rr
2143
+ biaor3 b iao3 rr
2144
+ biaor4 b iao4 rr
2145
+ biaor5 b iao5 rr
2146
+ bier1 b ie1 rr
2147
+ bier2 b ie2 rr
2148
+ bier3 b ie3 rr
2149
+ bier4 b ie4 rr
2150
+ bier5 b ie5 rr
2151
+ binr1 b in1 rr
2152
+ binr2 b in2 rr
2153
+ binr3 b in3 rr
2154
+ binr4 b in4 rr
2155
+ binr5 b in5 rr
2156
+ bingr1 b ing1 rr
2157
+ bingr2 b ing2 rr
2158
+ bingr3 b ing3 rr
2159
+ bingr4 b ing4 rr
2160
+ bingr5 b ing5 rr
2161
+ bor1 b o1 rr
2162
+ bor2 b o2 rr
2163
+ bor3 b o3 rr
2164
+ bor4 b o4 rr
2165
+ bor5 b o5 rr
2166
+ bur1 b u1 rr
2167
+ bur2 b u2 rr
2168
+ bur3 b u3 rr
2169
+ bur4 b u4 rr
2170
+ bur5 b u5 rr
2171
+ car1 c a1 rr
2172
+ car2 c a2 rr
2173
+ car3 c a3 rr
2174
+ car4 c a4 rr
2175
+ car5 c a5 rr
2176
+ cair1 c ai1 rr
2177
+ cair2 c ai2 rr
2178
+ cair3 c ai3 rr
2179
+ cair4 c ai4 rr
2180
+ cair5 c ai5 rr
2181
+ canr1 c an1 rr
2182
+ canr2 c an2 rr
2183
+ canr3 c an3 rr
2184
+ canr4 c an4 rr
2185
+ canr5 c an5 rr
2186
+ cangr1 c ang1 rr
2187
+ cangr2 c ang2 rr
2188
+ cangr3 c ang3 rr
2189
+ cangr4 c ang4 rr
2190
+ cangr5 c ang5 rr
2191
+ caor1 c ao1 rr
2192
+ caor2 c ao2 rr
2193
+ caor3 c ao3 rr
2194
+ caor4 c ao4 rr
2195
+ caor5 c ao5 rr
2196
+ cer1 c e1 rr
2197
+ cer2 c e2 rr
2198
+ cer3 c e3 rr
2199
+ cer4 c e4 rr
2200
+ cer5 c e5 rr
2201
+ cenr1 c en1 rr
2202
+ cenr2 c en2 rr
2203
+ cenr3 c en3 rr
2204
+ cenr4 c en4 rr
2205
+ cenr5 c en5 rr
2206
+ cengr1 c eng1 rr
2207
+ cengr2 c eng2 rr
2208
+ cengr3 c eng3 rr
2209
+ cengr4 c eng4 rr
2210
+ cengr5 c eng5 rr
2211
+ char1 ch a1 rr
2212
+ char2 ch a2 rr
2213
+ char3 ch a3 rr
2214
+ char4 ch a4 rr
2215
+ char5 ch a5 rr
2216
+ chair1 ch ai1 rr
2217
+ chair2 ch ai2 rr
2218
+ chair3 ch ai3 rr
2219
+ chair4 ch ai4 rr
2220
+ chair5 ch ai5 rr
2221
+ chanr1 ch an1 rr
2222
+ chanr2 ch an2 rr
2223
+ chanr3 ch an3 rr
2224
+ chanr4 ch an4 rr
2225
+ chanr5 ch an5 rr
2226
+ changr1 ch ang1 rr
2227
+ changr2 ch ang2 rr
2228
+ changr3 ch ang3 rr
2229
+ changr4 ch ang4 rr
2230
+ changr5 ch ang5 rr
2231
+ chaor1 ch ao1 rr
2232
+ chaor2 ch ao2 rr
2233
+ chaor3 ch ao3 rr
2234
+ chaor4 ch ao4 rr
2235
+ chaor5 ch ao5 rr
2236
+ cher1 ch e1 rr
2237
+ cher2 ch e2 rr
2238
+ cher3 ch e3 rr
2239
+ cher4 ch e4 rr
2240
+ cher5 ch e5 rr
2241
+ chenr1 ch en1 rr
2242
+ chenr2 ch en2 rr
2243
+ chenr3 ch en3 rr
2244
+ chenr4 ch en4 rr
2245
+ chenr5 ch en5 rr
2246
+ chengr1 ch eng1 rr
2247
+ chengr2 ch eng2 rr
2248
+ chengr3 ch eng3 rr
2249
+ chengr4 ch eng4 rr
2250
+ chengr5 ch eng5 rr
2251
+ chir1 ch iii1 rr
2252
+ chir2 ch iii2 rr
2253
+ chir3 ch iii3 rr
2254
+ chir4 ch iii4 rr
2255
+ chir5 ch iii5 rr
2256
+ chongr1 ch ong1 rr
2257
+ chongr2 ch ong2 rr
2258
+ chongr3 ch ong3 rr
2259
+ chongr4 ch ong4 rr
2260
+ chongr5 ch ong5 rr
2261
+ chour1 ch ou1 rr
2262
+ chour2 ch ou2 rr
2263
+ chour3 ch ou3 rr
2264
+ chour4 ch ou4 rr
2265
+ chour5 ch ou5 rr
2266
+ chur1 ch u1 rr
2267
+ chur2 ch u2 rr
2268
+ chur3 ch u3 rr
2269
+ chur4 ch u4 rr
2270
+ chur5 ch u5 rr
2271
+ chuair1 ch uai1 rr
2272
+ chuair2 ch uai2 rr
2273
+ chuair3 ch uai3 rr
2274
+ chuair4 ch uai4 rr
2275
+ chuair5 ch uai5 rr
2276
+ chuanr1 ch uan1 rr
2277
+ chuanr2 ch uan2 rr
2278
+ chuanr3 ch uan3 rr
2279
+ chuanr4 ch uan4 rr
2280
+ chuanr5 ch uan5 rr
2281
+ chuangr1 ch uang1 rr
2282
+ chuangr2 ch uang2 rr
2283
+ chuangr3 ch uang3 rr
2284
+ chuangr4 ch uang4 rr
2285
+ chuangr5 ch uang5 rr
2286
+ chuir1 ch uei1 rr
2287
+ chuir2 ch uei2 rr
2288
+ chuir3 ch uei3 rr
2289
+ chuir4 ch uei4 rr
2290
+ chuir5 ch uei5 rr
2291
+ chunr1 ch uen1 rr
2292
+ chunr2 ch uen2 rr
2293
+ chunr3 ch uen3 rr
2294
+ chunr4 ch uen4 rr
2295
+ chunr5 ch uen5 rr
2296
+ chuor1 ch uo1 rr
2297
+ chuor2 ch uo2 rr
2298
+ chuor3 ch uo3 rr
2299
+ chuor4 ch uo4 rr
2300
+ chuor5 ch uo5 rr
2301
+ cir1 c ii1 rr
2302
+ cir2 c ii2 rr
2303
+ cir3 c ii3 rr
2304
+ cir4 c ii4 rr
2305
+ cir5 c ii5 rr
2306
+ congr1 c ong1 rr
2307
+ congr2 c ong2 rr
2308
+ congr3 c ong3 rr
2309
+ congr4 c ong4 rr
2310
+ congr5 c ong5 rr
2311
+ cour1 c ou1 rr
2312
+ cour2 c ou2 rr
2313
+ cour3 c ou3 rr
2314
+ cour4 c ou4 rr
2315
+ cour5 c ou5 rr
2316
+ cur1 c u1 rr
2317
+ cur2 c u2 rr
2318
+ cur3 c u3 rr
2319
+ cur4 c u4 rr
2320
+ cur5 c u5 rr
2321
+ cuanr1 c uan1 rr
2322
+ cuanr2 c uan2 rr
2323
+ cuanr3 c uan3 rr
2324
+ cuanr4 c uan4 rr
2325
+ cuanr5 c uan5 rr
2326
+ cuir1 c uei1 rr
2327
+ cuir2 c uei2 rr
2328
+ cuir3 c uei3 rr
2329
+ cuir4 c uei4 rr
2330
+ cuir5 c uei5 rr
2331
+ cunr1 c uen1 rr
2332
+ cunr2 c uen2 rr
2333
+ cunr3 c uen3 rr
2334
+ cunr4 c uen4 rr
2335
+ cunr5 c uen5 rr
2336
+ cuor1 c uo1 rr
2337
+ cuor2 c uo2 rr
2338
+ cuor3 c uo3 rr
2339
+ cuor4 c uo4 rr
2340
+ cuor5 c uo5 rr
2341
+ dar1 d a1 rr
2342
+ dar2 d a2 rr
2343
+ dar3 d a3 rr
2344
+ dar4 d a4 rr
2345
+ dar5 d a5 rr
2346
+ dair1 d ai1 rr
2347
+ dair2 d ai2 rr
2348
+ dair3 d ai3 rr
2349
+ dair4 d ai4 rr
2350
+ dair5 d ai5 rr
2351
+ danr1 d an1 rr
2352
+ danr2 d an2 rr
2353
+ danr3 d an3 rr
2354
+ danr4 d an4 rr
2355
+ danr5 d an5 rr
2356
+ dangr1 d ang1 rr
2357
+ dangr2 d ang2 rr
2358
+ dangr3 d ang3 rr
2359
+ dangr4 d ang4 rr
2360
+ dangr5 d ang5 rr
2361
+ daor1 d ao1 rr
2362
+ daor2 d ao2 rr
2363
+ daor3 d ao3 rr
2364
+ daor4 d ao4 rr
2365
+ daor5 d ao5 rr
2366
+ der1 d e1 rr
2367
+ der2 d e2 rr
2368
+ der3 d e3 rr
2369
+ der4 d e4 rr
2370
+ der5 d e5 rr
2371
+ deir1 d ei1 rr
2372
+ deir2 d ei2 rr
2373
+ deir3 d ei3 rr
2374
+ deir4 d ei4 rr
2375
+ deir5 d ei5 rr
2376
+ denr1 d en1 rr
2377
+ denr2 d en2 rr
2378
+ denr3 d en3 rr
2379
+ denr4 d en4 rr
2380
+ denr5 d en5 rr
2381
+ dengr1 d eng1 rr
2382
+ dengr2 d eng2 rr
2383
+ dengr3 d eng3 rr
2384
+ dengr4 d eng4 rr
2385
+ dengr5 d eng5 rr
2386
+ dir1 d i1 rr
2387
+ dir2 d i2 rr
2388
+ dir3 d i3 rr
2389
+ dir4 d i4 rr
2390
+ dir5 d i5 rr
2391
+ diar1 d ia1 rr
2392
+ diar2 d ia2 rr
2393
+ diar3 d ia3 rr
2394
+ diar4 d ia4 rr
2395
+ diar5 d ia5 rr
2396
+ dianr1 d ian1 rr
2397
+ dianr2 d ian2 rr
2398
+ dianr3 d ian3 rr
2399
+ dianr4 d ian4 rr
2400
+ dianr5 d ian5 rr
2401
+ diaor1 d iao1 rr
2402
+ diaor2 d iao2 rr
2403
+ diaor3 d iao3 rr
2404
+ diaor4 d iao4 rr
2405
+ diaor5 d iao5 rr
2406
+ dier1 d ie1 rr
2407
+ dier2 d ie2 rr
2408
+ dier3 d ie3 rr
2409
+ dier4 d ie4 rr
2410
+ dier5 d ie5 rr
2411
+ dingr1 d ing1 rr
2412
+ dingr2 d ing2 rr
2413
+ dingr3 d ing3 rr
2414
+ dingr4 d ing4 rr
2415
+ dingr5 d ing5 rr
2416
+ diur1 d iou1 rr
2417
+ diur2 d iou2 rr
2418
+ diur3 d iou3 rr
2419
+ diur4 d iou4 rr
2420
+ diur5 d iou5 rr
2421
+ dongr1 d ong1 rr
2422
+ dongr2 d ong2 rr
2423
+ dongr3 d ong3 rr
2424
+ dongr4 d ong4 rr
2425
+ dongr5 d ong5 rr
2426
+ dour1 d ou1 rr
2427
+ dour2 d ou2 rr
2428
+ dour3 d ou3 rr
2429
+ dour4 d ou4 rr
2430
+ dour5 d ou5 rr
2431
+ dur1 d u1 rr
2432
+ dur2 d u2 rr
2433
+ dur3 d u3 rr
2434
+ dur4 d u4 rr
2435
+ dur5 d u5 rr
2436
+ duanr1 d uan1 rr
2437
+ duanr2 d uan2 rr
2438
+ duanr3 d uan3 rr
2439
+ duanr4 d uan4 rr
2440
+ duanr5 d uan5 rr
2441
+ duir1 d uei1 rr
2442
+ duir2 d uei2 rr
2443
+ duir3 d uei3 rr
2444
+ duir4 d uei4 rr
2445
+ duir5 d uei5 rr
2446
+ dunr1 d uen1 rr
2447
+ dunr2 d uen2 rr
2448
+ dunr3 d uen3 rr
2449
+ dunr4 d uen4 rr
2450
+ dunr5 d uen5 rr
2451
+ duor1 d uo1 rr
2452
+ duor2 d uo2 rr
2453
+ duor3 d uo3 rr
2454
+ duor4 d uo4 rr
2455
+ duor5 d uo5 rr
2456
+ er1 e1 rr
2457
+ er2 e2 rr
2458
+ er3 e3 rr
2459
+ er4 e4 rr
2460
+ er5 e5 rr
2461
+ eir1 ei1 rr
2462
+ eir2 ei2 rr
2463
+ eir3 ei3 rr
2464
+ eir4 ei4 rr
2465
+ eir5 ei5 rr
2466
+ enr1 en1 rr
2467
+ enr2 en2 rr
2468
+ enr3 en3 rr
2469
+ enr4 en4 rr
2470
+ enr5 en5 rr
2471
+ engr1 eng1 rr
2472
+ engr2 eng2 rr
2473
+ engr3 eng3 rr
2474
+ engr4 eng4 rr
2475
+ engr5 eng5 rr
2476
+ far1 f a1 rr
2477
+ far2 f a2 rr
2478
+ far3 f a3 rr
2479
+ far4 f a4 rr
2480
+ far5 f a5 rr
2481
+ fanr1 f an1 rr
2482
+ fanr2 f an2 rr
2483
+ fanr3 f an3 rr
2484
+ fanr4 f an4 rr
2485
+ fanr5 f an5 rr
2486
+ fangr1 f ang1 rr
2487
+ fangr2 f ang2 rr
2488
+ fangr3 f ang3 rr
2489
+ fangr4 f ang4 rr
2490
+ fangr5 f ang5 rr
2491
+ feir1 f ei1 rr
2492
+ feir2 f ei2 rr
2493
+ feir3 f ei3 rr
2494
+ feir4 f ei4 rr
2495
+ feir5 f ei5 rr
2496
+ fenr1 f en1 rr
2497
+ fenr2 f en2 rr
2498
+ fenr3 f en3 rr
2499
+ fenr4 f en4 rr
2500
+ fenr5 f en5 rr
2501
+ fengr1 f eng1 rr
2502
+ fengr2 f eng2 rr
2503
+ fengr3 f eng3 rr
2504
+ fengr4 f eng4 rr
2505
+ fengr5 f eng5 rr
2506
+ for1 f o1 rr
2507
+ for2 f o2 rr
2508
+ for3 f o3 rr
2509
+ for4 f o4 rr
2510
+ for5 f o5 rr
2511
+ four1 f ou1 rr
2512
+ four2 f ou2 rr
2513
+ four3 f ou3 rr
2514
+ four4 f ou4 rr
2515
+ four5 f ou5 rr
2516
+ fur1 f u1 rr
2517
+ fur2 f u2 rr
2518
+ fur3 f u3 rr
2519
+ fur4 f u4 rr
2520
+ fur5 f u5 rr
2521
+ gar1 g a1 rr
2522
+ gar2 g a2 rr
2523
+ gar3 g a3 rr
2524
+ gar4 g a4 rr
2525
+ gar5 g a5 rr
2526
+ gair1 g ai1 rr
2527
+ gair2 g ai2 rr
2528
+ gair3 g ai3 rr
2529
+ gair4 g ai4 rr
2530
+ gair5 g ai5 rr
2531
+ ganr1 g an1 rr
2532
+ ganr2 g an2 rr
2533
+ ganr3 g an3 rr
2534
+ ganr4 g an4 rr
2535
+ ganr5 g an5 rr
2536
+ gangr1 g ang1 rr
2537
+ gangr2 g ang2 rr
2538
+ gangr3 g ang3 rr
2539
+ gangr4 g ang4 rr
2540
+ gangr5 g ang5 rr
2541
+ gaor1 g ao1 rr
2542
+ gaor2 g ao2 rr
2543
+ gaor3 g ao3 rr
2544
+ gaor4 g ao4 rr
2545
+ gaor5 g ao5 rr
2546
+ ger1 g e1 rr
2547
+ ger2 g e2 rr
2548
+ ger3 g e3 rr
2549
+ ger4 g e4 rr
2550
+ ger5 g e5 rr
2551
+ geir1 g ei1 rr
2552
+ geir2 g ei2 rr
2553
+ geir3 g ei3 rr
2554
+ geir4 g ei4 rr
2555
+ geir5 g ei5 rr
2556
+ genr1 g en1 rr
2557
+ genr2 g en2 rr
2558
+ genr3 g en3 rr
2559
+ genr4 g en4 rr
2560
+ genr5 g en5 rr
2561
+ gengr1 g eng1 rr
2562
+ gengr2 g eng2 rr
2563
+ gengr3 g eng3 rr
2564
+ gengr4 g eng4 rr
2565
+ gengr5 g eng5 rr
2566
+ gongr1 g ong1 rr
2567
+ gongr2 g ong2 rr
2568
+ gongr3 g ong3 rr
2569
+ gongr4 g ong4 rr
2570
+ gongr5 g ong5 rr
2571
+ gour1 g ou1 rr
2572
+ gour2 g ou2 rr
2573
+ gour3 g ou3 rr
2574
+ gour4 g ou4 rr
2575
+ gour5 g ou5 rr
2576
+ gur1 g u1 rr
2577
+ gur2 g u2 rr
2578
+ gur3 g u3 rr
2579
+ gur4 g u4 rr
2580
+ gur5 g u5 rr
2581
+ guar1 g ua1 rr
2582
+ guar2 g ua2 rr
2583
+ guar3 g ua3 rr
2584
+ guar4 g ua4 rr
2585
+ guar5 g ua5 rr
2586
+ guair1 g uai1 rr
2587
+ guair2 g uai2 rr
2588
+ guair3 g uai3 rr
2589
+ guair4 g uai4 rr
2590
+ guair5 g uai5 rr
2591
+ guanr1 g uan1 rr
2592
+ guanr2 g uan2 rr
2593
+ guanr3 g uan3 rr
2594
+ guanr4 g uan4 rr
2595
+ guanr5 g uan5 rr
2596
+ guangr1 g uang1 rr
2597
+ guangr2 g uang2 rr
2598
+ guangr3 g uang3 rr
2599
+ guangr4 g uang4 rr
2600
+ guangr5 g uang5 rr
2601
+ guir1 g uei1 rr
2602
+ guir2 g uei2 rr
2603
+ guir3 g uei3 rr
2604
+ guir4 g uei4 rr
2605
+ guir5 g uei5 rr
2606
+ gunr1 g uen1 rr
2607
+ gunr2 g uen2 rr
2608
+ gunr3 g uen3 rr
2609
+ gunr4 g uen4 rr
2610
+ gunr5 g uen5 rr
2611
+ guor1 g uo1 rr
2612
+ guor2 g uo2 rr
2613
+ guor3 g uo3 rr
2614
+ guor4 g uo4 rr
2615
+ guor5 g uo5 rr
2616
+ har1 h a1 rr
2617
+ har2 h a2 rr
2618
+ har3 h a3 rr
2619
+ har4 h a4 rr
2620
+ har5 h a5 rr
2621
+ hair1 h ai1 rr
2622
+ hair2 h ai2 rr
2623
+ hair3 h ai3 rr
2624
+ hair4 h ai4 rr
2625
+ hair5 h ai5 rr
2626
+ hanr1 h an1 rr
2627
+ hanr2 h an2 rr
2628
+ hanr3 h an3 rr
2629
+ hanr4 h an4 rr
2630
+ hanr5 h an5 rr
2631
+ hangr1 h ang1 rr
2632
+ hangr2 h ang2 rr
2633
+ hangr3 h ang3 rr
2634
+ hangr4 h ang4 rr
2635
+ hangr5 h ang5 rr
2636
+ haor1 h ao1 rr
2637
+ haor2 h ao2 rr
2638
+ haor3 h ao3 rr
2639
+ haor4 h ao4 rr
2640
+ haor5 h ao5 rr
2641
+ her1 h e1 rr
2642
+ her2 h e2 rr
2643
+ her3 h e3 rr
2644
+ her4 h e4 rr
2645
+ her5 h e5 rr
2646
+ heir1 h ei1 rr
2647
+ heir2 h ei2 rr
2648
+ heir3 h ei3 rr
2649
+ heir4 h ei4 rr
2650
+ heir5 h ei5 rr
2651
+ henr1 h en1 rr
2652
+ henr2 h en2 rr
2653
+ henr3 h en3 rr
2654
+ henr4 h en4 rr
2655
+ henr5 h en5 rr
2656
+ hengr1 h eng1 rr
2657
+ hengr2 h eng2 rr
2658
+ hengr3 h eng3 rr
2659
+ hengr4 h eng4 rr
2660
+ hengr5 h eng5 rr
2661
+ hongr1 h ong1 rr
2662
+ hongr2 h ong2 rr
2663
+ hongr3 h ong3 rr
2664
+ hongr4 h ong4 rr
2665
+ hongr5 h ong5 rr
2666
+ hour1 h ou1 rr
2667
+ hour2 h ou2 rr
2668
+ hour3 h ou3 rr
2669
+ hour4 h ou4 rr
2670
+ hour5 h ou5 rr
2671
+ hur1 h u1 rr
2672
+ hur2 h u2 rr
2673
+ hur3 h u3 rr
2674
+ hur4 h u4 rr
2675
+ hur5 h u5 rr
2676
+ huar1 h ua1 rr
2677
+ huar2 h ua2 rr
2678
+ huar3 h ua3 rr
2679
+ huar4 h ua4 rr
2680
+ huar5 h ua5 rr
2681
+ huair1 h uai1 rr
2682
+ huair2 h uai2 rr
2683
+ huair3 h uai3 rr
2684
+ huair4 h uai4 rr
2685
+ huair5 h uai5 rr
2686
+ huanr1 h uan1 rr
2687
+ huanr2 h uan2 rr
2688
+ huanr3 h uan3 rr
2689
+ huanr4 h uan4 rr
2690
+ huanr5 h uan5 rr
2691
+ huangr1 h uang1 rr
2692
+ huangr2 h uang2 rr
2693
+ huangr3 h uang3 rr
2694
+ huangr4 h uang4 rr
2695
+ huangr5 h uang5 rr
2696
+ huir1 h uei1 rr
2697
+ huir2 h uei2 rr
2698
+ huir3 h uei3 rr
2699
+ huir4 h uei4 rr
2700
+ huir5 h uei5 rr
2701
+ hunr1 h uen1 rr
2702
+ hunr2 h uen2 rr
2703
+ hunr3 h uen3 rr
2704
+ hunr4 h uen4 rr
2705
+ hunr5 h uen5 rr
2706
+ huor1 h uo1 rr
2707
+ huor2 h uo2 rr
2708
+ huor3 h uo3 rr
2709
+ huor4 h uo4 rr
2710
+ huor5 h uo5 rr
2711
+ jir1 j i1 rr
2712
+ jir2 j i2 rr
2713
+ jir3 j i3 rr
2714
+ jir4 j i4 rr
2715
+ jir5 j i5 rr
2716
+ jiar1 j ia1 rr
2717
+ jiar2 j ia2 rr
2718
+ jiar3 j ia3 rr
2719
+ jiar4 j ia4 rr
2720
+ jiar5 j ia5 rr
2721
+ jianr1 j ian1 rr
2722
+ jianr2 j ian2 rr
2723
+ jianr3 j ian3 rr
2724
+ jianr4 j ian4 rr
2725
+ jianr5 j ian5 rr
2726
+ jiangr1 j iang1 rr
2727
+ jiangr2 j iang2 rr
2728
+ jiangr3 j iang3 rr
2729
+ jiangr4 j iang4 rr
2730
+ jiangr5 j iang5 rr
2731
+ jiaor1 j iao1 rr
2732
+ jiaor2 j iao2 rr
2733
+ jiaor3 j iao3 rr
2734
+ jiaor4 j iao4 rr
2735
+ jiaor5 j iao5 rr
2736
+ jier1 j ie1 rr
2737
+ jier2 j ie2 rr
2738
+ jier3 j ie3 rr
2739
+ jier4 j ie4 rr
2740
+ jier5 j ie5 rr
2741
+ jinr1 j in1 rr
2742
+ jinr2 j in2 rr
2743
+ jinr3 j in3 rr
2744
+ jinr4 j in4 rr
2745
+ jinr5 j in5 rr
2746
+ jingr1 j ing1 rr
2747
+ jingr2 j ing2 rr
2748
+ jingr3 j ing3 rr
2749
+ jingr4 j ing4 rr
2750
+ jingr5 j ing5 rr
2751
+ jiongr1 j iong1 rr
2752
+ jiongr2 j iong2 rr
2753
+ jiongr3 j iong3 rr
2754
+ jiongr4 j iong4 rr
2755
+ jiongr5 j iong5 rr
2756
+ jiur1 j iou1 rr
2757
+ jiur2 j iou2 rr
2758
+ jiur3 j iou3 rr
2759
+ jiur4 j iou4 rr
2760
+ jiur5 j iou5 rr
2761
+ jur1 j v1 rr
2762
+ jur2 j v2 rr
2763
+ jur3 j v3 rr
2764
+ jur4 j v4 rr
2765
+ jur5 j v5 rr
2766
+ juanr1 j van1 rr
2767
+ juanr2 j van2 rr
2768
+ juanr3 j van3 rr
2769
+ juanr4 j van4 rr
2770
+ juanr5 j van5 rr
2771
+ juer1 j ve1 rr
2772
+ juer2 j ve2 rr
2773
+ juer3 j ve3 rr
2774
+ juer4 j ve4 rr
2775
+ juer5 j ve5 rr
2776
+ junr1 j vn1 rr
2777
+ junr2 j vn2 rr
2778
+ junr3 j vn3 rr
2779
+ junr4 j vn4 rr
2780
+ junr5 j vn5 rr
2781
+ kar1 k a1 rr
2782
+ kar2 k a2 rr
2783
+ kar3 k a3 rr
2784
+ kar4 k a4 rr
2785
+ kar5 k a5 rr
2786
+ kair1 k ai1 rr
2787
+ kair2 k ai2 rr
2788
+ kair3 k ai3 rr
2789
+ kair4 k ai4 rr
2790
+ kair5 k ai5 rr
2791
+ kanr1 k an1 rr
2792
+ kanr2 k an2 rr
2793
+ kanr3 k an3 rr
2794
+ kanr4 k an4 rr
2795
+ kanr5 k an5 rr
2796
+ kangr1 k ang1 rr
2797
+ kangr2 k ang2 rr
2798
+ kangr3 k ang3 rr
2799
+ kangr4 k ang4 rr
2800
+ kangr5 k ang5 rr
2801
+ kaor1 k ao1 rr
2802
+ kaor2 k ao2 rr
2803
+ kaor3 k ao3 rr
2804
+ kaor4 k ao4 rr
2805
+ kaor5 k ao5 rr
2806
+ ker1 k e1 rr
2807
+ ker2 k e2 rr
2808
+ ker3 k e3 rr
2809
+ ker4 k e4 rr
2810
+ ker5 k e5 rr
2811
+ keir1 k ei1 rr
2812
+ keir2 k ei2 rr
2813
+ keir3 k ei3 rr
2814
+ keir4 k ei4 rr
2815
+ keir5 k ei5 rr
2816
+ kenr1 k en1 rr
2817
+ kenr2 k en2 rr
2818
+ kenr3 k en3 rr
2819
+ kenr4 k en4 rr
2820
+ kenr5 k en5 rr
2821
+ kengr1 k eng1 rr
2822
+ kengr2 k eng2 rr
2823
+ kengr3 k eng3 rr
2824
+ kengr4 k eng4 rr
2825
+ kengr5 k eng5 rr
2826
+ kongr1 k ong1 rr
2827
+ kongr2 k ong2 rr
2828
+ kongr3 k ong3 rr
2829
+ kongr4 k ong4 rr
2830
+ kongr5 k ong5 rr
2831
+ kour1 k ou1 rr
2832
+ kour2 k ou2 rr
2833
+ kour3 k ou3 rr
2834
+ kour4 k ou4 rr
2835
+ kour5 k ou5 rr
2836
+ kur1 k u1 rr
2837
+ kur2 k u2 rr
2838
+ kur3 k u3 rr
2839
+ kur4 k u4 rr
2840
+ kur5 k u5 rr
2841
+ kuar1 k ua1 rr
2842
+ kuar2 k ua2 rr
2843
+ kuar3 k ua3 rr
2844
+ kuar4 k ua4 rr
2845
+ kuar5 k ua5 rr
2846
+ kuair1 k uai1 rr
2847
+ kuair2 k uai2 rr
2848
+ kuair3 k uai3 rr
2849
+ kuair4 k uai4 rr
2850
+ kuair5 k uai5 rr
2851
+ kuanr1 k uan1 rr
2852
+ kuanr2 k uan2 rr
2853
+ kuanr3 k uan3 rr
2854
+ kuanr4 k uan4 rr
2855
+ kuanr5 k uan5 rr
2856
+ kuangr1 k uang1 rr
2857
+ kuangr2 k uang2 rr
2858
+ kuangr3 k uang3 rr
2859
+ kuangr4 k uang4 rr
2860
+ kuangr5 k uang5 rr
2861
+ kuir1 k uei1 rr
2862
+ kuir2 k uei2 rr
2863
+ kuir3 k uei3 rr
2864
+ kuir4 k uei4 rr
2865
+ kuir5 k uei5 rr
2866
+ kunr1 k uen1 rr
2867
+ kunr2 k uen2 rr
2868
+ kunr3 k uen3 rr
2869
+ kunr4 k uen4 rr
2870
+ kunr5 k uen5 rr
2871
+ kuor1 k uo1 rr
2872
+ kuor2 k uo2 rr
2873
+ kuor3 k uo3 rr
2874
+ kuor4 k uo4 rr
2875
+ kuor5 k uo5 rr
2876
+ lar1 l a1 rr
2877
+ lar2 l a2 rr
2878
+ lar3 l a3 rr
2879
+ lar4 l a4 rr
2880
+ lar5 l a5 rr
2881
+ lair1 l ai1 rr
2882
+ lair2 l ai2 rr
2883
+ lair3 l ai3 rr
2884
+ lair4 l ai4 rr
2885
+ lair5 l ai5 rr
2886
+ lanr1 l an1 rr
2887
+ lanr2 l an2 rr
2888
+ lanr3 l an3 rr
2889
+ lanr4 l an4 rr
2890
+ lanr5 l an5 rr
2891
+ langr1 l ang1 rr
2892
+ langr2 l ang2 rr
2893
+ langr3 l ang3 rr
2894
+ langr4 l ang4 rr
2895
+ langr5 l ang5 rr
2896
+ laor1 l ao1 rr
2897
+ laor2 l ao2 rr
2898
+ laor3 l ao3 rr
2899
+ laor4 l ao4 rr
2900
+ laor5 l ao5 rr
2901
+ ler1 l e1 rr
2902
+ ler2 l e2 rr
2903
+ ler3 l e3 rr
2904
+ ler4 l e4 rr
2905
+ ler5 l e5 rr
2906
+ leir1 l ei1 rr
2907
+ leir2 l ei2 rr
2908
+ leir3 l ei3 rr
2909
+ leir4 l ei4 rr
2910
+ leir5 l ei5 rr
2911
+ lengr1 l eng1 rr
2912
+ lengr2 l eng2 rr
2913
+ lengr3 l eng3 rr
2914
+ lengr4 l eng4 rr
2915
+ lengr5 l eng5 rr
2916
+ lir1 l i1 rr
2917
+ lir2 l i2 rr
2918
+ lir3 l i3 rr
2919
+ lir4 l i4 rr
2920
+ lir5 l i5 rr
2921
+ liar1 l ia1 rr
2922
+ liar2 l ia2 rr
2923
+ liar3 l ia3 rr
2924
+ liar4 l ia4 rr
2925
+ liar5 l ia5 rr
2926
+ lianr1 l ian1 rr
2927
+ lianr2 l ian2 rr
2928
+ lianr3 l ian3 rr
2929
+ lianr4 l ian4 rr
2930
+ lianr5 l ian5 rr
2931
+ liangr1 l iang1 rr
2932
+ liangr2 l iang2 rr
2933
+ liangr3 l iang3 rr
2934
+ liangr4 l iang4 rr
2935
+ liangr5 l iang5 rr
2936
+ liaor1 l iao1 rr
2937
+ liaor2 l iao2 rr
2938
+ liaor3 l iao3 rr
2939
+ liaor4 l iao4 rr
2940
+ liaor5 l iao5 rr
2941
+ lier1 l ie1 rr
2942
+ lier2 l ie2 rr
2943
+ lier3 l ie3 rr
2944
+ lier4 l ie4 rr
2945
+ lier5 l ie5 rr
2946
+ linr1 l in1 rr
2947
+ linr2 l in2 rr
2948
+ linr3 l in3 rr
2949
+ linr4 l in4 rr
2950
+ linr5 l in5 rr
2951
+ lingr1 l ing1 rr
2952
+ lingr2 l ing2 rr
2953
+ lingr3 l ing3 rr
2954
+ lingr4 l ing4 rr
2955
+ lingr5 l ing5 rr
2956
+ liur1 l iou1 rr
2957
+ liur2 l iou2 rr
2958
+ liur3 l iou3 rr
2959
+ liur4 l iou4 rr
2960
+ liur5 l iou5 rr
2961
+ lor1 l o1 rr
2962
+ lor2 l o2 rr
2963
+ lor3 l o3 rr
2964
+ lor4 l o4 rr
2965
+ lor5 l o5 rr
2966
+ longr1 l ong1 rr
2967
+ longr2 l ong2 rr
2968
+ longr3 l ong3 rr
2969
+ longr4 l ong4 rr
2970
+ longr5 l ong5 rr
2971
+ lour1 l ou1 rr
2972
+ lour2 l ou2 rr
2973
+ lour3 l ou3 rr
2974
+ lour4 l ou4 rr
2975
+ lour5 l ou5 rr
2976
+ lur1 l u1 rr
2977
+ lur2 l u2 rr
2978
+ lur3 l u3 rr
2979
+ lur4 l u4 rr
2980
+ lur5 l u5 rr
2981
+ luanr1 l uan1 rr
2982
+ luanr2 l uan2 rr
2983
+ luanr3 l uan3 rr
2984
+ luanr4 l uan4 rr
2985
+ luanr5 l uan5 rr
2986
+ luer1 l ve1 rr
2987
+ luer2 l ve2 rr
2988
+ luer3 l ve3 rr
2989
+ luer4 l ve4 rr
2990
+ luer5 l ve5 rr
2991
+ lver1 l ve1 rr
2992
+ lver2 l ve2 rr
2993
+ lver3 l ve3 rr
2994
+ lver4 l ve4 rr
2995
+ lver5 l ve5 rr
2996
+ lunr1 l uen1 rr
2997
+ lunr2 l uen2 rr
2998
+ lunr3 l uen3 rr
2999
+ lunr4 l uen4 rr
3000
+ lunr5 l uen5 rr
3001
+ luor1 l uo1 rr
3002
+ luor2 l uo2 rr
3003
+ luor3 l uo3 rr
3004
+ luor4 l uo4 rr
3005
+ luor5 l uo5 rr
3006
+ lvr1 l v1 rr
3007
+ lvr2 l v2 rr
3008
+ lvr3 l v3 rr
3009
+ lvr4 l v4 rr
3010
+ lvr5 l v5 rr
3011
+ mar1 m a1 rr
3012
+ mar2 m a2 rr
3013
+ mar3 m a3 rr
3014
+ mar4 m a4 rr
3015
+ mar5 m a5 rr
3016
+ mair1 m ai1 rr
3017
+ mair2 m ai2 rr
3018
+ mair3 m ai3 rr
3019
+ mair4 m ai4 rr
3020
+ mair5 m ai5 rr
3021
+ manr1 m an1 rr
3022
+ manr2 m an2 rr
3023
+ manr3 m an3 rr
3024
+ manr4 m an4 rr
3025
+ manr5 m an5 rr
3026
+ mangr1 m ang1 rr
3027
+ mangr2 m ang2 rr
3028
+ mangr3 m ang3 rr
3029
+ mangr4 m ang4 rr
3030
+ mangr5 m ang5 rr
3031
+ maor1 m ao1 rr
3032
+ maor2 m ao2 rr
3033
+ maor3 m ao3 rr
3034
+ maor4 m ao4 rr
3035
+ maor5 m ao5 rr
3036
+ mer1 m e1 rr
3037
+ mer2 m e2 rr
3038
+ mer3 m e3 rr
3039
+ mer4 m e4 rr
3040
+ mer5 m e5 rr
3041
+ meir1 m ei1 rr
3042
+ meir2 m ei2 rr
3043
+ meir3 m ei3 rr
3044
+ meir4 m ei4 rr
3045
+ meir5 m ei5 rr
3046
+ menr1 m en1 rr
3047
+ menr2 m en2 rr
3048
+ menr3 m en3 rr
3049
+ menr4 m en4 rr
3050
+ menr5 m en5 rr
3051
+ mengr1 m eng1 rr
3052
+ mengr2 m eng2 rr
3053
+ mengr3 m eng3 rr
3054
+ mengr4 m eng4 rr
3055
+ mengr5 m eng5 rr
3056
+ mir1 m i1 rr
3057
+ mir2 m i2 rr
3058
+ mir3 m i3 rr
3059
+ mir4 m i4 rr
3060
+ mir5 m i5 rr
3061
+ mianr1 m ian1 rr
3062
+ mianr2 m ian2 rr
3063
+ mianr3 m ian3 rr
3064
+ mianr4 m ian4 rr
3065
+ mianr5 m ian5 rr
3066
+ miaor1 m iao1 rr
3067
+ miaor2 m iao2 rr
3068
+ miaor3 m iao3 rr
3069
+ miaor4 m iao4 rr
3070
+ miaor5 m iao5 rr
3071
+ mier1 m ie1 rr
3072
+ mier2 m ie2 rr
3073
+ mier3 m ie3 rr
3074
+ mier4 m ie4 rr
3075
+ mier5 m ie5 rr
3076
+ minr1 m in1 rr
3077
+ minr2 m in2 rr
3078
+ minr3 m in3 rr
3079
+ minr4 m in4 rr
3080
+ minr5 m in5 rr
3081
+ mingr1 m ing1 rr
3082
+ mingr2 m ing2 rr
3083
+ mingr3 m ing3 rr
3084
+ mingr4 m ing4 rr
3085
+ mingr5 m ing5 rr
3086
+ miur1 m iou1 rr
3087
+ miur2 m iou2 rr
3088
+ miur3 m iou3 rr
3089
+ miur4 m iou4 rr
3090
+ miur5 m iou5 rr
3091
+ mor1 m o1 rr
3092
+ mor2 m o2 rr
3093
+ mor3 m o3 rr
3094
+ mor4 m o4 rr
3095
+ mor5 m o5 rr
3096
+ mour1 m ou1 rr
3097
+ mour2 m ou2 rr
3098
+ mour3 m ou3 rr
3099
+ mour4 m ou4 rr
3100
+ mour5 m ou5 rr
3101
+ mur1 m u1 rr
3102
+ mur2 m u2 rr
3103
+ mur3 m u3 rr
3104
+ mur4 m u4 rr
3105
+ mur5 m u5 rr
3106
+ nar1 n a1 rr
3107
+ nar2 n a2 rr
3108
+ nar3 n a3 rr
3109
+ nar4 n a4 rr
3110
+ nar5 n a5 rr
3111
+ nair1 n ai1 rr
3112
+ nair2 n ai2 rr
3113
+ nair3 n ai3 rr
3114
+ nair4 n ai4 rr
3115
+ nair5 n ai5 rr
3116
+ nanr1 n an1 rr
3117
+ nanr2 n an2 rr
3118
+ nanr3 n an3 rr
3119
+ nanr4 n an4 rr
3120
+ nanr5 n an5 rr
3121
+ nangr1 n ang1 rr
3122
+ nangr2 n ang2 rr
3123
+ nangr3 n ang3 rr
3124
+ nangr4 n ang4 rr
3125
+ nangr5 n ang5 rr
3126
+ naor1 n ao1 rr
3127
+ naor2 n ao2 rr
3128
+ naor3 n ao3 rr
3129
+ naor4 n ao4 rr
3130
+ naor5 n ao5 rr
3131
+ ner1 n e1 rr
3132
+ ner2 n e2 rr
3133
+ ner3 n e3 rr
3134
+ ner4 n e4 rr
3135
+ ner5 n e5 rr
3136
+ neir1 n ei1 rr
3137
+ neir2 n ei2 rr
3138
+ neir3 n ei3 rr
3139
+ neir4 n ei4 rr
3140
+ neir5 n ei5 rr
3141
+ nenr1 n en1 rr
3142
+ nenr2 n en2 rr
3143
+ nenr3 n en3 rr
3144
+ nenr4 n en4 rr
3145
+ nenr5 n en5 rr
3146
+ nengr1 n eng1 rr
3147
+ nengr2 n eng2 rr
3148
+ nengr3 n eng3 rr
3149
+ nengr4 n eng4 rr
3150
+ nengr5 n eng5 rr
3151
+ nir1 n i1 rr
3152
+ nir2 n i2 rr
3153
+ nir3 n i3 rr
3154
+ nir4 n i4 rr
3155
+ nir5 n i5 rr
3156
+ nianr1 n ian1 rr
3157
+ nianr2 n ian2 rr
3158
+ nianr3 n ian3 rr
3159
+ nianr4 n ian4 rr
3160
+ nianr5 n ian5 rr
3161
+ niangr1 n iang1 rr
3162
+ niangr2 n iang2 rr
3163
+ niangr3 n iang3 rr
3164
+ niangr4 n iang4 rr
3165
+ niangr5 n iang5 rr
3166
+ niaor1 n iao1 rr
3167
+ niaor2 n iao2 rr
3168
+ niaor3 n iao3 rr
3169
+ niaor4 n iao4 rr
3170
+ niaor5 n iao5 rr
3171
+ nier1 n ie1 rr
3172
+ nier2 n ie2 rr
3173
+ nier3 n ie3 rr
3174
+ nier4 n ie4 rr
3175
+ nier5 n ie5 rr
3176
+ ninr1 n in1 rr
3177
+ ninr2 n in2 rr
3178
+ ninr3 n in3 rr
3179
+ ninr4 n in4 rr
3180
+ ninr5 n in5 rr
3181
+ ningr1 n ing1 rr
3182
+ ningr2 n ing2 rr
3183
+ ningr3 n ing3 rr
3184
+ ningr4 n ing4 rr
3185
+ ningr5 n ing5 rr
3186
+ niur1 n iou1 rr
3187
+ niur2 n iou2 rr
3188
+ niur3 n iou3 rr
3189
+ niur4 n iou4 rr
3190
+ niur5 n iou5 rr
3191
+ nongr1 n ong1 rr
3192
+ nongr2 n ong2 rr
3193
+ nongr3 n ong3 rr
3194
+ nongr4 n ong4 rr
3195
+ nongr5 n ong5 rr
3196
+ nour1 n ou1 rr
3197
+ nour2 n ou2 rr
3198
+ nour3 n ou3 rr
3199
+ nour4 n ou4 rr
3200
+ nour5 n ou5 rr
3201
+ nur1 n u1 rr
3202
+ nur2 n u2 rr
3203
+ nur3 n u3 rr
3204
+ nur4 n u4 rr
3205
+ nur5 n u5 rr
3206
+ nuanr1 n uan1 rr
3207
+ nuanr2 n uan2 rr
3208
+ nuanr3 n uan3 rr
3209
+ nuanr4 n uan4 rr
3210
+ nuanr5 n uan5 rr
3211
+ nuer1 n ve1 rr
3212
+ nuer2 n ve2 rr
3213
+ nuer3 n ve3 rr
3214
+ nuer4 n ve4 rr
3215
+ nuer5 n ve5 rr
3216
+ nver1 n ve1 rr
3217
+ nver2 n ve2 rr
3218
+ nver3 n ve3 rr
3219
+ nver4 n ve4 rr
3220
+ nver5 n ve5 rr
3221
+ nuor1 n uo1 rr
3222
+ nuor2 n uo2 rr
3223
+ nuor3 n uo3 rr
3224
+ nuor4 n uo4 rr
3225
+ nuor5 n uo5 rr
3226
+ nvr1 n v1 rr
3227
+ nvr2 n v2 rr
3228
+ nvr3 n v3 rr
3229
+ nvr4 n v4 rr
3230
+ nvr5 n v5 rr
3231
+ or1 o1 rr
3232
+ or2 o2 rr
3233
+ or3 o3 rr
3234
+ or4 o4 rr
3235
+ or5 o5 rr
3236
+ our1 ou1 rr
3237
+ our2 ou2 rr
3238
+ our3 ou3 rr
3239
+ our4 ou4 rr
3240
+ our5 ou5 rr
3241
+ par1 p a1 rr
3242
+ par2 p a2 rr
3243
+ par3 p a3 rr
3244
+ par4 p a4 rr
3245
+ par5 p a5 rr
3246
+ pair1 p ai1 rr
3247
+ pair2 p ai2 rr
3248
+ pair3 p ai3 rr
3249
+ pair4 p ai4 rr
3250
+ pair5 p ai5 rr
3251
+ panr1 p an1 rr
3252
+ panr2 p an2 rr
3253
+ panr3 p an3 rr
3254
+ panr4 p an4 rr
3255
+ panr5 p an5 rr
3256
+ pangr1 p ang1 rr
3257
+ pangr2 p ang2 rr
3258
+ pangr3 p ang3 rr
3259
+ pangr4 p ang4 rr
3260
+ pangr5 p ang5 rr
3261
+ paor1 p ao1 rr
3262
+ paor2 p ao2 rr
3263
+ paor3 p ao3 rr
3264
+ paor4 p ao4 rr
3265
+ paor5 p ao5 rr
3266
+ peir1 p ei1 rr
3267
+ peir2 p ei2 rr
3268
+ peir3 p ei3 rr
3269
+ peir4 p ei4 rr
3270
+ peir5 p ei5 rr
3271
+ penr1 p en1 rr
3272
+ penr2 p en2 rr
3273
+ penr3 p en3 rr
3274
+ penr4 p en4 rr
3275
+ penr5 p en5 rr
3276
+ pengr1 p eng1 rr
3277
+ pengr2 p eng2 rr
3278
+ pengr3 p eng3 rr
3279
+ pengr4 p eng4 rr
3280
+ pengr5 p eng5 rr
3281
+ pir1 p i1 rr
3282
+ pir2 p i2 rr
3283
+ pir3 p i3 rr
3284
+ pir4 p i4 rr
3285
+ pir5 p i5 rr
3286
+ pianr1 p ian1 rr
3287
+ pianr2 p ian2 rr
3288
+ pianr3 p ian3 rr
3289
+ pianr4 p ian4 rr
3290
+ pianr5 p ian5 rr
3291
+ piaor1 p iao1 rr
3292
+ piaor2 p iao2 rr
3293
+ piaor3 p iao3 rr
3294
+ piaor4 p iao4 rr
3295
+ piaor5 p iao5 rr
3296
+ pier1 p ie1 rr
3297
+ pier2 p ie2 rr
3298
+ pier3 p ie3 rr
3299
+ pier4 p ie4 rr
3300
+ pier5 p ie5 rr
3301
+ pinr1 p in1 rr
3302
+ pinr2 p in2 rr
3303
+ pinr3 p in3 rr
3304
+ pinr4 p in4 rr
3305
+ pinr5 p in5 rr
3306
+ pingr1 p ing1 rr
3307
+ pingr2 p ing2 rr
3308
+ pingr3 p ing3 rr
3309
+ pingr4 p ing4 rr
3310
+ pingr5 p ing5 rr
3311
+ por1 p o1 rr
3312
+ por2 p o2 rr
3313
+ por3 p o3 rr
3314
+ por4 p o4 rr
3315
+ por5 p o5 rr
3316
+ pour1 p ou1 rr
3317
+ pour2 p ou2 rr
3318
+ pour3 p ou3 rr
3319
+ pour4 p ou4 rr
3320
+ pour5 p ou5 rr
3321
+ pur1 p u1 rr
3322
+ pur2 p u2 rr
3323
+ pur3 p u3 rr
3324
+ pur4 p u4 rr
3325
+ pur5 p u5 rr
3326
+ qir1 q i1 rr
3327
+ qir2 q i2 rr
3328
+ qir3 q i3 rr
3329
+ qir4 q i4 rr
3330
+ qir5 q i5 rr
3331
+ qiar1 q ia1 rr
3332
+ qiar2 q ia2 rr
3333
+ qiar3 q ia3 rr
3334
+ qiar4 q ia4 rr
3335
+ qiar5 q ia5 rr
3336
+ qianr1 q ian1 rr
3337
+ qianr2 q ian2 rr
3338
+ qianr3 q ian3 rr
3339
+ qianr4 q ian4 rr
3340
+ qianr5 q ian5 rr
3341
+ qiangr1 q iang1 rr
3342
+ qiangr2 q iang2 rr
3343
+ qiangr3 q iang3 rr
3344
+ qiangr4 q iang4 rr
3345
+ qiangr5 q iang5 rr
3346
+ qiaor1 q iao1 rr
3347
+ qiaor2 q iao2 rr
3348
+ qiaor3 q iao3 rr
3349
+ qiaor4 q iao4 rr
3350
+ qiaor5 q iao5 rr
3351
+ qier1 q ie1 rr
3352
+ qier2 q ie2 rr
3353
+ qier3 q ie3 rr
3354
+ qier4 q ie4 rr
3355
+ qier5 q ie5 rr
3356
+ qinr1 q in1 rr
3357
+ qinr2 q in2 rr
3358
+ qinr3 q in3 rr
3359
+ qinr4 q in4 rr
3360
+ qinr5 q in5 rr
3361
+ qingr1 q ing1 rr
3362
+ qingr2 q ing2 rr
3363
+ qingr3 q ing3 rr
3364
+ qingr4 q ing4 rr
3365
+ qingr5 q ing5 rr
3366
+ qiongr1 q iong1 rr
3367
+ qiongr2 q iong2 rr
3368
+ qiongr3 q iong3 rr
3369
+ qiongr4 q iong4 rr
3370
+ qiongr5 q iong5 rr
3371
+ qiur1 q iou1 rr
3372
+ qiur2 q iou2 rr
3373
+ qiur3 q iou3 rr
3374
+ qiur4 q iou4 rr
3375
+ qiur5 q iou5 rr
3376
+ qur1 q v1 rr
3377
+ qur2 q v2 rr
3378
+ qur3 q v3 rr
3379
+ qur4 q v4 rr
3380
+ qur5 q v5 rr
3381
+ quanr1 q van1 rr
3382
+ quanr2 q van2 rr
3383
+ quanr3 q van3 rr
3384
+ quanr4 q van4 rr
3385
+ quanr5 q van5 rr
3386
+ quer1 q ve1 rr
3387
+ quer2 q ve2 rr
3388
+ quer3 q ve3 rr
3389
+ quer4 q ve4 rr
3390
+ quer5 q ve5 rr
3391
+ qunr1 q vn1 rr
3392
+ qunr2 q vn2 rr
3393
+ qunr3 q vn3 rr
3394
+ qunr4 q vn4 rr
3395
+ qunr5 q vn5 rr
3396
+ ranr1 r an1 rr
3397
+ ranr2 r an2 rr
3398
+ ranr3 r an3 rr
3399
+ ranr4 r an4 rr
3400
+ ranr5 r an5 rr
3401
+ rangr1 r ang1 rr
3402
+ rangr2 r ang2 rr
3403
+ rangr3 r ang3 rr
3404
+ rangr4 r ang4 rr
3405
+ rangr5 r ang5 rr
3406
+ raor1 r ao1 rr
3407
+ raor2 r ao2 rr
3408
+ raor3 r ao3 rr
3409
+ raor4 r ao4 rr
3410
+ raor5 r ao5 rr
3411
+ rer1 r e1 rr
3412
+ rer2 r e2 rr
3413
+ rer3 r e3 rr
3414
+ rer4 r e4 rr
3415
+ rer5 r e5 rr
3416
+ renr1 r en1 rr
3417
+ renr2 r en2 rr
3418
+ renr3 r en3 rr
3419
+ renr4 r en4 rr
3420
+ renr5 r en5 rr
3421
+ rengr1 r eng1 rr
3422
+ rengr2 r eng2 rr
3423
+ rengr3 r eng3 rr
3424
+ rengr4 r eng4 rr
3425
+ rengr5 r eng5 rr
3426
+ rir1 r iii1 rr
3427
+ rir2 r iii2 rr
3428
+ rir3 r iii3 rr
3429
+ rir4 r iii4 rr
3430
+ rir5 r iii5 rr
3431
+ rongr1 r ong1 rr
3432
+ rongr2 r ong2 rr
3433
+ rongr3 r ong3 rr
3434
+ rongr4 r ong4 rr
3435
+ rongr5 r ong5 rr
3436
+ rour1 r ou1 rr
3437
+ rour2 r ou2 rr
3438
+ rour3 r ou3 rr
3439
+ rour4 r ou4 rr
3440
+ rour5 r ou5 rr
3441
+ rur1 r u1 rr
3442
+ rur2 r u2 rr
3443
+ rur3 r u3 rr
3444
+ rur4 r u4 rr
3445
+ rur5 r u5 rr
3446
+ ruar1 r ua1 rr
3447
+ ruar2 r ua2 rr
3448
+ ruar3 r ua3 rr
3449
+ ruar4 r ua4 rr
3450
+ ruar5 r ua5 rr
3451
+ ruanr1 r uan1 rr
3452
+ ruanr2 r uan2 rr
3453
+ ruanr3 r uan3 rr
3454
+ ruanr4 r uan4 rr
3455
+ ruanr5 r uan5 rr
3456
+ ruir1 r uei1 rr
3457
+ ruir2 r uei2 rr
3458
+ ruir3 r uei3 rr
3459
+ ruir4 r uei4 rr
3460
+ ruir5 r uei5 rr
3461
+ runr1 r uen1 rr
3462
+ runr2 r uen2 rr
3463
+ runr3 r uen3 rr
3464
+ runr4 r uen4 rr
3465
+ runr5 r uen5 rr
3466
+ ruor1 r uo1 rr
3467
+ ruor2 r uo2 rr
3468
+ ruor3 r uo3 rr
3469
+ ruor4 r uo4 rr
3470
+ ruor5 r uo5 rr
3471
+ sar1 s a1 rr
3472
+ sar2 s a2 rr
3473
+ sar3 s a3 rr
3474
+ sar4 s a4 rr
3475
+ sar5 s a5 rr
3476
+ sair1 s ai1 rr
3477
+ sair2 s ai2 rr
3478
+ sair3 s ai3 rr
3479
+ sair4 s ai4 rr
3480
+ sair5 s ai5 rr
3481
+ sanr1 s an1 rr
3482
+ sanr2 s an2 rr
3483
+ sanr3 s an3 rr
3484
+ sanr4 s an4 rr
3485
+ sanr5 s an5 rr
3486
+ sangr1 s ang1 rr
3487
+ sangr2 s ang2 rr
3488
+ sangr3 s ang3 rr
3489
+ sangr4 s ang4 rr
3490
+ sangr5 s ang5 rr
3491
+ saor1 s ao1 rr
3492
+ saor2 s ao2 rr
3493
+ saor3 s ao3 rr
3494
+ saor4 s ao4 rr
3495
+ saor5 s ao5 rr
3496
+ ser1 s e1 rr
3497
+ ser2 s e2 rr
3498
+ ser3 s e3 rr
3499
+ ser4 s e4 rr
3500
+ ser5 s e5 rr
3501
+ senr1 s en1 rr
3502
+ senr2 s en2 rr
3503
+ senr3 s en3 rr
3504
+ senr4 s en4 rr
3505
+ senr5 s en5 rr
3506
+ sengr1 s eng1 rr
3507
+ sengr2 s eng2 rr
3508
+ sengr3 s eng3 rr
3509
+ sengr4 s eng4 rr
3510
+ sengr5 s eng5 rr
3511
+ shar1 sh a1 rr
3512
+ shar2 sh a2 rr
3513
+ shar3 sh a3 rr
3514
+ shar4 sh a4 rr
3515
+ shar5 sh a5 rr
3516
+ shair1 sh ai1 rr
3517
+ shair2 sh ai2 rr
3518
+ shair3 sh ai3 rr
3519
+ shair4 sh ai4 rr
3520
+ shair5 sh ai5 rr
3521
+ shanr1 sh an1 rr
3522
+ shanr2 sh an2 rr
3523
+ shanr3 sh an3 rr
3524
+ shanr4 sh an4 rr
3525
+ shanr5 sh an5 rr
3526
+ shangr1 sh ang1 rr
3527
+ shangr2 sh ang2 rr
3528
+ shangr3 sh ang3 rr
3529
+ shangr4 sh ang4 rr
3530
+ shangr5 sh ang5 rr
3531
+ shaor1 sh ao1 rr
3532
+ shaor2 sh ao2 rr
3533
+ shaor3 sh ao3 rr
3534
+ shaor4 sh ao4 rr
3535
+ shaor5 sh ao5 rr
3536
+ sher1 sh e1 rr
3537
+ sher2 sh e2 rr
3538
+ sher3 sh e3 rr
3539
+ sher4 sh e4 rr
3540
+ sher5 sh e5 rr
3541
+ sheir1 sh ei1 rr
3542
+ sheir2 sh ei2 rr
3543
+ sheir3 sh ei3 rr
3544
+ sheir4 sh ei4 rr
3545
+ sheir5 sh ei5 rr
3546
+ shenr1 sh en1 rr
3547
+ shenr2 sh en2 rr
3548
+ shenr3 sh en3 rr
3549
+ shenr4 sh en4 rr
3550
+ shenr5 sh en5 rr
3551
+ shengr1 sh eng1 rr
3552
+ shengr2 sh eng2 rr
3553
+ shengr3 sh eng3 rr
3554
+ shengr4 sh eng4 rr
3555
+ shengr5 sh eng5 rr
3556
+ shir1 sh iii1 rr
3557
+ shir2 sh iii2 rr
3558
+ shir3 sh iii3 rr
3559
+ shir4 sh iii4 rr
3560
+ shir5 sh iii5 rr
3561
+ shour1 sh ou1 rr
3562
+ shour2 sh ou2 rr
3563
+ shour3 sh ou3 rr
3564
+ shour4 sh ou4 rr
3565
+ shour5 sh ou5 rr
3566
+ shur1 sh u1 rr
3567
+ shur2 sh u2 rr
3568
+ shur3 sh u3 rr
3569
+ shur4 sh u4 rr
3570
+ shur5 sh u5 rr
3571
+ shuar1 sh ua1 rr
3572
+ shuar2 sh ua2 rr
3573
+ shuar3 sh ua3 rr
3574
+ shuar4 sh ua4 rr
3575
+ shuar5 sh ua5 rr
3576
+ shuair1 sh uai1 rr
3577
+ shuair2 sh uai2 rr
3578
+ shuair3 sh uai3 rr
3579
+ shuair4 sh uai4 rr
3580
+ shuair5 sh uai5 rr
3581
+ shuanr1 sh uan1 rr
3582
+ shuanr2 sh uan2 rr
3583
+ shuanr3 sh uan3 rr
3584
+ shuanr4 sh uan4 rr
3585
+ shuanr5 sh uan5 rr
3586
+ shuangr1 sh uang1 rr
3587
+ shuangr2 sh uang2 rr
3588
+ shuangr3 sh uang3 rr
3589
+ shuangr4 sh uang4 rr
3590
+ shuangr5 sh uang5 rr
3591
+ shuir1 sh uei1 rr
3592
+ shuir2 sh uei2 rr
3593
+ shuir3 sh uei3 rr
3594
+ shuir4 sh uei4 rr
3595
+ shuir5 sh uei5 rr
3596
+ shunr1 sh uen1 rr
3597
+ shunr2 sh uen2 rr
3598
+ shunr3 sh uen3 rr
3599
+ shunr4 sh uen4 rr
3600
+ shunr5 sh uen5 rr
3601
+ shuor1 sh uo1 rr
3602
+ shuor2 sh uo2 rr
3603
+ shuor3 sh uo3 rr
3604
+ shuor4 sh uo4 rr
3605
+ shuor5 sh uo5 rr
3606
+ sir1 s ii1 rr
3607
+ sir2 s ii2 rr
3608
+ sir3 s ii3 rr
3609
+ sir4 s ii4 rr
3610
+ sir5 s ii5 rr
3611
+ songr1 s ong1 rr
3612
+ songr2 s ong2 rr
3613
+ songr3 s ong3 rr
3614
+ songr4 s ong4 rr
3615
+ songr5 s ong5 rr
3616
+ sour1 s ou1 rr
3617
+ sour2 s ou2 rr
3618
+ sour3 s ou3 rr
3619
+ sour4 s ou4 rr
3620
+ sour5 s ou5 rr
3621
+ sur1 s u1 rr
3622
+ sur2 s u2 rr
3623
+ sur3 s u3 rr
3624
+ sur4 s u4 rr
3625
+ sur5 s u5 rr
3626
+ suanr1 s uan1 rr
3627
+ suanr2 s uan2 rr
3628
+ suanr3 s uan3 rr
3629
+ suanr4 s uan4 rr
3630
+ suanr5 s uan5 rr
3631
+ suir1 s uei1 rr
3632
+ suir2 s uei2 rr
3633
+ suir3 s uei3 rr
3634
+ suir4 s uei4 rr
3635
+ suir5 s uei5 rr
3636
+ sunr1 s uen1 rr
3637
+ sunr2 s uen2 rr
3638
+ sunr3 s uen3 rr
3639
+ sunr4 s uen4 rr
3640
+ sunr5 s uen5 rr
3641
+ suor1 s uo1 rr
3642
+ suor2 s uo2 rr
3643
+ suor3 s uo3 rr
3644
+ suor4 s uo4 rr
3645
+ suor5 s uo5 rr
3646
+ tar1 t a1 rr
3647
+ tar2 t a2 rr
3648
+ tar3 t a3 rr
3649
+ tar4 t a4 rr
3650
+ tar5 t a5 rr
3651
+ tair1 t ai1 rr
3652
+ tair2 t ai2 rr
3653
+ tair3 t ai3 rr
3654
+ tair4 t ai4 rr
3655
+ tair5 t ai5 rr
3656
+ tanr1 t an1 rr
3657
+ tanr2 t an2 rr
3658
+ tanr3 t an3 rr
3659
+ tanr4 t an4 rr
3660
+ tanr5 t an5 rr
3661
+ tangr1 t ang1 rr
3662
+ tangr2 t ang2 rr
3663
+ tangr3 t ang3 rr
3664
+ tangr4 t ang4 rr
3665
+ tangr5 t ang5 rr
3666
+ taor1 t ao1 rr
3667
+ taor2 t ao2 rr
3668
+ taor3 t ao3 rr
3669
+ taor4 t ao4 rr
3670
+ taor5 t ao5 rr
3671
+ ter1 t e1 rr
3672
+ ter2 t e2 rr
3673
+ ter3 t e3 rr
3674
+ ter4 t e4 rr
3675
+ ter5 t e5 rr
3676
+ teir1 t ei1 rr
3677
+ teir2 t ei2 rr
3678
+ teir3 t ei3 rr
3679
+ teir4 t ei4 rr
3680
+ teir5 t ei5 rr
3681
+ tengr1 t eng1 rr
3682
+ tengr2 t eng2 rr
3683
+ tengr3 t eng3 rr
3684
+ tengr4 t eng4 rr
3685
+ tengr5 t eng5 rr
3686
+ tir1 t i1 rr
3687
+ tir2 t i2 rr
3688
+ tir3 t i3 rr
3689
+ tir4 t i4 rr
3690
+ tir5 t i5 rr
3691
+ tianr1 t ian1 rr
3692
+ tianr2 t ian2 rr
3693
+ tianr3 t ian3 rr
3694
+ tianr4 t ian4 rr
3695
+ tianr5 t ian5 rr
3696
+ tiaor1 t iao1 rr
3697
+ tiaor2 t iao2 rr
3698
+ tiaor3 t iao3 rr
3699
+ tiaor4 t iao4 rr
3700
+ tiaor5 t iao5 rr
3701
+ tier1 t ie1 rr
3702
+ tier2 t ie2 rr
3703
+ tier3 t ie3 rr
3704
+ tier4 t ie4 rr
3705
+ tier5 t ie5 rr
3706
+ tingr1 t ing1 rr
3707
+ tingr2 t ing2 rr
3708
+ tingr3 t ing3 rr
3709
+ tingr4 t ing4 rr
3710
+ tingr5 t ing5 rr
3711
+ tongr1 t ong1 rr
3712
+ tongr2 t ong2 rr
3713
+ tongr3 t ong3 rr
3714
+ tongr4 t ong4 rr
3715
+ tongr5 t ong5 rr
3716
+ tour1 t ou1 rr
3717
+ tour2 t ou2 rr
3718
+ tour3 t ou3 rr
3719
+ tour4 t ou4 rr
3720
+ tour5 t ou5 rr
3721
+ tur1 t u1 rr
3722
+ tur2 t u2 rr
3723
+ tur3 t u3 rr
3724
+ tur4 t u4 rr
3725
+ tur5 t u5 rr
3726
+ tuanr1 t uan1 rr
3727
+ tuanr2 t uan2 rr
3728
+ tuanr3 t uan3 rr
3729
+ tuanr4 t uan4 rr
3730
+ tuanr5 t uan5 rr
3731
+ tuir1 t uei1 rr
3732
+ tuir2 t uei2 rr
3733
+ tuir3 t uei3 rr
3734
+ tuir4 t uei4 rr
3735
+ tuir5 t uei5 rr
3736
+ tunr1 t uen1 rr
3737
+ tunr2 t uen2 rr
3738
+ tunr3 t uen3 rr
3739
+ tunr4 t uen4 rr
3740
+ tunr5 t uen5 rr
3741
+ tuor1 t uo1 rr
3742
+ tuor2 t uo2 rr
3743
+ tuor3 t uo3 rr
3744
+ tuor4 t uo4 rr
3745
+ tuor5 t uo5 rr
3746
+ war1 w ua1 rr
3747
+ war2 w ua2 rr
3748
+ war3 w ua3 rr
3749
+ war4 w ua4 rr
3750
+ war5 w ua5 rr
3751
+ wair1 w uai1 rr
3752
+ wair2 w uai2 rr
3753
+ wair3 w uai3 rr
3754
+ wair4 w uai4 rr
3755
+ wair5 w uai5 rr
3756
+ wanr1 w uan1 rr
3757
+ wanr2 w uan2 rr
3758
+ wanr3 w uan3 rr
3759
+ wanr4 w uan4 rr
3760
+ wanr5 w uan5 rr
3761
+ wangr1 w uang1 rr
3762
+ wangr2 w uang2 rr
3763
+ wangr3 w uang3 rr
3764
+ wangr4 w uang4 rr
3765
+ wangr5 w uang5 rr
3766
+ weir1 w uei1 rr
3767
+ weir2 w uei2 rr
3768
+ weir3 w uei3 rr
3769
+ weir4 w uei4 rr
3770
+ weir5 w uei5 rr
3771
+ wenr1 w uen1 rr
3772
+ wenr2 w uen2 rr
3773
+ wenr3 w uen3 rr
3774
+ wenr4 w uen4 rr
3775
+ wenr5 w uen5 rr
3776
+ wengr1 w uen1 rr
3777
+ wengr2 w uen2 rr
3778
+ wengr3 w uen3 rr
3779
+ wengr4 w uen4 rr
3780
+ wengr5 w uen5 rr
3781
+ wor1 w uo1 rr
3782
+ wor2 w uo2 rr
3783
+ wor3 w uo3 rr
3784
+ wor4 w uo4 rr
3785
+ wor5 w uo5 rr
3786
+ wur1 w u1 rr
3787
+ wur2 w u2 rr
3788
+ wur3 w u3 rr
3789
+ wur4 w u4 rr
3790
+ wur5 w u5 rr
3791
+ xir1 x i1 rr
3792
+ xir2 x i2 rr
3793
+ xir3 x i3 rr
3794
+ xir4 x i4 rr
3795
+ xir5 x i5 rr
3796
+ xiar1 x ia1 rr
3797
+ xiar2 x ia2 rr
3798
+ xiar3 x ia3 rr
3799
+ xiar4 x ia4 rr
3800
+ xiar5 x ia5 rr
3801
+ xianr1 x ian1 rr
3802
+ xianr2 x ian2 rr
3803
+ xianr3 x ian3 rr
3804
+ xianr4 x ian4 rr
3805
+ xianr5 x ian5 rr
3806
+ xiangr1 x iang1 rr
3807
+ xiangr2 x iang2 rr
3808
+ xiangr3 x iang3 rr
3809
+ xiangr4 x iang4 rr
3810
+ xiangr5 x iang5 rr
3811
+ xiaor1 x iao1 rr
3812
+ xiaor2 x iao2 rr
3813
+ xiaor3 x iao3 rr
3814
+ xiaor4 x iao4 rr
3815
+ xiaor5 x iao5 rr
3816
+ xier1 x ie1 rr
3817
+ xier2 x ie2 rr
3818
+ xier3 x ie3 rr
3819
+ xier4 x ie4 rr
3820
+ xier5 x ie5 rr
3821
+ xinr1 x in1 rr
3822
+ xinr2 x in2 rr
3823
+ xinr3 x in3 rr
3824
+ xinr4 x in4 rr
3825
+ xinr5 x in5 rr
3826
+ xingr1 x ing1 rr
3827
+ xingr2 x ing2 rr
3828
+ xingr3 x ing3 rr
3829
+ xingr4 x ing4 rr
3830
+ xingr5 x ing5 rr
3831
+ xiongr1 x iong1 rr
3832
+ xiongr2 x iong2 rr
3833
+ xiongr3 x iong3 rr
3834
+ xiongr4 x iong4 rr
3835
+ xiongr5 x iong5 rr
3836
+ xiur1 x iou1 rr
3837
+ xiur2 x iou2 rr
3838
+ xiur3 x iou3 rr
3839
+ xiur4 x iou4 rr
3840
+ xiur5 x iou5 rr
3841
+ xur1 x v1 rr
3842
+ xur2 x v2 rr
3843
+ xur3 x v3 rr
3844
+ xur4 x v4 rr
3845
+ xur5 x v5 rr
3846
+ xuanr1 x van1 rr
3847
+ xuanr2 x van2 rr
3848
+ xuanr3 x van3 rr
3849
+ xuanr4 x van4 rr
3850
+ xuanr5 x van5 rr
3851
+ xuer1 x ve1 rr
3852
+ xuer2 x ve2 rr
3853
+ xuer3 x ve3 rr
3854
+ xuer4 x ve4 rr
3855
+ xuer5 x ve5 rr
3856
+ xunr1 x vn1 rr
3857
+ xunr2 x vn2 rr
3858
+ xunr3 x vn3 rr
3859
+ xunr4 x vn4 rr
3860
+ xunr5 x vn5 rr
3861
+ yar1 y ia1 rr
3862
+ yar2 y ia2 rr
3863
+ yar3 y ia3 rr
3864
+ yar4 y ia4 rr
3865
+ yar5 y ia5 rr
3866
+ yanr1 y ian1 rr
3867
+ yanr2 y ian2 rr
3868
+ yanr3 y ian3 rr
3869
+ yanr4 y ian4 rr
3870
+ yanr5 y ian5 rr
3871
+ yangr1 y iang1 rr
3872
+ yangr2 y iang2 rr
3873
+ yangr3 y iang3 rr
3874
+ yangr4 y iang4 rr
3875
+ yangr5 y iang5 rr
3876
+ yaor1 y iao1 rr
3877
+ yaor2 y iao2 rr
3878
+ yaor3 y iao3 rr
3879
+ yaor4 y iao4 rr
3880
+ yaor5 y iao5 rr
3881
+ yer1 y ie1 rr
3882
+ yer2 y ie2 rr
3883
+ yer3 y ie3 rr
3884
+ yer4 y ie4 rr
3885
+ yer5 y ie5 rr
3886
+ yir1 y i1 rr
3887
+ yir2 y i2 rr
3888
+ yir3 y i3 rr
3889
+ yir4 y i4 rr
3890
+ yir5 y i5 rr
3891
+ yinr1 y in1 rr
3892
+ yinr2 y in2 rr
3893
+ yinr3 y in3 rr
3894
+ yinr4 y in4 rr
3895
+ yinr5 y in5 rr
3896
+ yingr1 y ing1 rr
3897
+ yingr2 y ing2 rr
3898
+ yingr3 y ing3 rr
3899
+ yingr4 y ing4 rr
3900
+ yingr5 y ing5 rr
3901
+ yor1 y iou1 rr
3902
+ yor2 y iou2 rr
3903
+ yor3 y iou3 rr
3904
+ yor4 y iou4 rr
3905
+ yor5 y iou5 rr
3906
+ yongr1 y iong1 rr
3907
+ yongr2 y iong2 rr
3908
+ yongr3 y iong3 rr
3909
+ yongr4 y iong4 rr
3910
+ yongr5 y iong5 rr
3911
+ your1 y iou1 rr
3912
+ your2 y iou2 rr
3913
+ your3 y iou3 rr
3914
+ your4 y iou4 rr
3915
+ your5 y iou5 rr
3916
+ yur1 y v1 rr
3917
+ yur2 y v2 rr
3918
+ yur3 y v3 rr
3919
+ yur4 y v4 rr
3920
+ yur5 y v5 rr
3921
+ yuanr1 y van1 rr
3922
+ yuanr2 y van2 rr
3923
+ yuanr3 y van3 rr
3924
+ yuanr4 y van4 rr
3925
+ yuanr5 y van5 rr
3926
+ yuer1 y ve1 rr
3927
+ yuer2 y ve2 rr
3928
+ yuer3 y ve3 rr
3929
+ yuer4 y ve4 rr
3930
+ yuer5 y ve5 rr
3931
+ yunr1 y vn1 rr
3932
+ yunr2 y vn2 rr
3933
+ yunr3 y vn3 rr
3934
+ yunr4 y vn4 rr
3935
+ yunr5 y vn5 rr
3936
+ zar1 z a1 rr
3937
+ zar2 z a2 rr
3938
+ zar3 z a3 rr
3939
+ zar4 z a4 rr
3940
+ zar5 z a5 rr
3941
+ zair1 z ai1 rr
3942
+ zair2 z ai2 rr
3943
+ zair3 z ai3 rr
3944
+ zair4 z ai4 rr
3945
+ zair5 z ai5 rr
3946
+ zanr1 z an1 rr
3947
+ zanr2 z an2 rr
3948
+ zanr3 z an3 rr
3949
+ zanr4 z an4 rr
3950
+ zanr5 z an5 rr
3951
+ zangr1 z ang1 rr
3952
+ zangr2 z ang2 rr
3953
+ zangr3 z ang3 rr
3954
+ zangr4 z ang4 rr
3955
+ zangr5 z ang5 rr
3956
+ zaor1 z ao1 rr
3957
+ zaor2 z ao2 rr
3958
+ zaor3 z ao3 rr
3959
+ zaor4 z ao4 rr
3960
+ zaor5 z ao5 rr
3961
+ zer1 z e1 rr
3962
+ zer2 z e2 rr
3963
+ zer3 z e3 rr
3964
+ zer4 z e4 rr
3965
+ zer5 z e5 rr
3966
+ zeir1 z ei1 rr
3967
+ zeir2 z ei2 rr
3968
+ zeir3 z ei3 rr
3969
+ zeir4 z ei4 rr
3970
+ zeir5 z ei5 rr
3971
+ zenr1 z en1 rr
3972
+ zenr2 z en2 rr
3973
+ zenr3 z en3 rr
3974
+ zenr4 z en4 rr
3975
+ zenr5 z en5 rr
3976
+ zengr1 z eng1 rr
3977
+ zengr2 z eng2 rr
3978
+ zengr3 z eng3 rr
3979
+ zengr4 z eng4 rr
3980
+ zengr5 z eng5 rr
3981
+ zhar1 zh a1 rr
3982
+ zhar2 zh a2 rr
3983
+ zhar3 zh a3 rr
3984
+ zhar4 zh a4 rr
3985
+ zhar5 zh a5 rr
3986
+ zhair1 zh ai1 rr
3987
+ zhair2 zh ai2 rr
3988
+ zhair3 zh ai3 rr
3989
+ zhair4 zh ai4 rr
3990
+ zhair5 zh ai5 rr
3991
+ zhanr1 zh an1 rr
3992
+ zhanr2 zh an2 rr
3993
+ zhanr3 zh an3 rr
3994
+ zhanr4 zh an4 rr
3995
+ zhanr5 zh an5 rr
3996
+ zhangr1 zh ang1 rr
3997
+ zhangr2 zh ang2 rr
3998
+ zhangr3 zh ang3 rr
3999
+ zhangr4 zh ang4 rr
4000
+ zhangr5 zh ang5 rr
4001
+ zhaor1 zh ao1 rr
4002
+ zhaor2 zh ao2 rr
4003
+ zhaor3 zh ao3 rr
4004
+ zhaor4 zh ao4 rr
4005
+ zhaor5 zh ao5 rr
4006
+ zher1 zh e1 rr
4007
+ zher2 zh e2 rr
4008
+ zher3 zh e3 rr
4009
+ zher4 zh e4 rr
4010
+ zher5 zh e5 rr
4011
+ zheir1 zh ei1 rr
4012
+ zheir2 zh ei2 rr
4013
+ zheir3 zh ei3 rr
4014
+ zheir4 zh ei4 rr
4015
+ zheir5 zh ei5 rr
4016
+ zhenr1 zh en1 rr
4017
+ zhenr2 zh en2 rr
4018
+ zhenr3 zh en3 rr
4019
+ zhenr4 zh en4 rr
4020
+ zhenr5 zh en5 rr
4021
+ zhengr1 zh eng1 rr
4022
+ zhengr2 zh eng2 rr
4023
+ zhengr3 zh eng3 rr
4024
+ zhengr4 zh eng4 rr
4025
+ zhengr5 zh eng5 rr
4026
+ zhir1 zh iii1 rr
4027
+ zhir2 zh iii2 rr
4028
+ zhir3 zh iii3 rr
4029
+ zhir4 zh iii4 rr
4030
+ zhir5 zh iii5 rr
4031
+ zhongr1 zh ong1 rr
4032
+ zhongr2 zh ong2 rr
4033
+ zhongr3 zh ong3 rr
4034
+ zhongr4 zh ong4 rr
4035
+ zhongr5 zh ong5 rr
4036
+ zhour1 zh ou1 rr
4037
+ zhour2 zh ou2 rr
4038
+ zhour3 zh ou3 rr
4039
+ zhour4 zh ou4 rr
4040
+ zhour5 zh ou5 rr
4041
+ zhur1 zh u1 rr
4042
+ zhur2 zh u2 rr
4043
+ zhur3 zh u3 rr
4044
+ zhur4 zh u4 rr
4045
+ zhur5 zh u5 rr
4046
+ zhuar1 zh ua1 rr
4047
+ zhuar2 zh ua2 rr
4048
+ zhuar3 zh ua3 rr
4049
+ zhuar4 zh ua4 rr
4050
+ zhuar5 zh ua5 rr
4051
+ zhuair1 zh uai1 rr
4052
+ zhuair2 zh uai2 rr
4053
+ zhuair3 zh uai3 rr
4054
+ zhuair4 zh uai4 rr
4055
+ zhuair5 zh uai5 rr
4056
+ zhuanr1 zh uan1 rr
4057
+ zhuanr2 zh uan2 rr
4058
+ zhuanr3 zh uan3 rr
4059
+ zhuanr4 zh uan4 rr
4060
+ zhuanr5 zh uan5 rr
4061
+ zhuangr1 zh uang1 rr
4062
+ zhuangr2 zh uang2 rr
4063
+ zhuangr3 zh uang3 rr
4064
+ zhuangr4 zh uang4 rr
4065
+ zhuangr5 zh uang5 rr
4066
+ zhuir1 zh uei1 rr
4067
+ zhuir2 zh uei2 rr
4068
+ zhuir3 zh uei3 rr
4069
+ zhuir4 zh uei4 rr
4070
+ zhuir5 zh uei5 rr
4071
+ zhunr1 zh uen1 rr
4072
+ zhunr2 zh uen2 rr
4073
+ zhunr3 zh uen3 rr
4074
+ zhunr4 zh uen4 rr
4075
+ zhunr5 zh uen5 rr
4076
+ zhuor1 zh uo1 rr
4077
+ zhuor2 zh uo2 rr
4078
+ zhuor3 zh uo3 rr
4079
+ zhuor4 zh uo4 rr
4080
+ zhuor5 zh uo5 rr
4081
+ zir1 z ii1 rr
4082
+ zir2 z ii2 rr
4083
+ zir3 z ii3 rr
4084
+ zir4 z ii4 rr
4085
+ zir5 z ii5 rr
4086
+ zongr1 z ong1 rr
4087
+ zongr2 z ong2 rr
4088
+ zongr3 z ong3 rr
4089
+ zongr4 z ong4 rr
4090
+ zongr5 z ong5 rr
4091
+ zour1 z ou1 rr
4092
+ zour2 z ou2 rr
4093
+ zour3 z ou3 rr
4094
+ zour4 z ou4 rr
4095
+ zour5 z ou5 rr
4096
+ zur1 z u1 rr
4097
+ zur2 z u2 rr
4098
+ zur3 z u3 rr
4099
+ zur4 z u4 rr
4100
+ zur5 z u5 rr
4101
+ zuanr1 z uan1 rr
4102
+ zuanr2 z uan2 rr
4103
+ zuanr3 z uan3 rr
4104
+ zuanr4 z uan4 rr
4105
+ zuanr5 z uan5 rr
4106
+ zuir1 z uei1 rr
4107
+ zuir2 z uei2 rr
4108
+ zuir3 z uei3 rr
4109
+ zuir4 z uei4 rr
4110
+ zuir5 z uei5 rr
4111
+ zunr1 z uen1 rr
4112
+ zunr2 z uen2 rr
4113
+ zunr3 z uen3 rr
4114
+ zunr4 z uen4 rr
4115
+ zunr5 z uen5 rr
4116
+ zuor1 z uo1 rr
4117
+ zuor2 z uo2 rr
4118
+ zuor3 z uo3 rr
4119
+ zuor4 z uo4 rr
4120
+ zuor5 z uo5 rr
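
Each lexicon entry above maps an erhua ("-r" suffixed) pinyin syllable to its phone sequence: an initial, a tone-numbered final, and the retroflex marker "rr". A minimal parsing sketch (the helper name is illustrative, not part of this commit):

    def parse_lexicon_line(line):
        # e.g. "zhuangr4 zh uang4 rr" -> ("zhuangr4", ["zh", "uang4", "rr"])
        fields = line.split()
        return fields[0], fields[1:]

    syllable, phones = parse_lexicon_line("zhuangr4 zh uang4 rr")
    # syllable == "zhuangr4"; phones == ["zh", "uang4", "rr"]
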
text/numbers.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ import inflect
9
+ import re
10
+
11
+ _inflect = inflect.engine()
12
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
13
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
14
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
15
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
16
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
17
+ _number_re = re.compile(r"[0-9]+")
18
+
19
+
20
+ def _remove_commas(m):
21
+ return m.group(1).replace(",", "")
22
+
23
+
24
+ def _expand_decimal_point(m):
25
+ return m.group(1).replace(".", " point ")
26
+
27
+
28
+ def _expand_dollars(m):
29
+ match = m.group(1)
30
+ parts = match.split(".")
31
+ if len(parts) > 2:
32
+ return match + " dollars" # Unexpected format
33
+ dollars = int(parts[0]) if parts[0] else 0
34
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
35
+ if dollars and cents:
36
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
37
+ cent_unit = "cent" if cents == 1 else "cents"
38
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
39
+ elif dollars:
40
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
41
+ return "%s %s" % (dollars, dollar_unit)
42
+ elif cents:
43
+ cent_unit = "cent" if cents == 1 else "cents"
44
+ return "%s %s" % (cents, cent_unit)
45
+ else:
46
+ return "zero dollars"
47
+
48
+
49
+ def _expand_ordinal(m):
50
+ return _inflect.number_to_words(m.group(0))
51
+
52
+
53
+ def _expand_number(m):
54
+ num = int(m.group(0))
55
+ if num > 1000 and num < 3000:
56
+ if num == 2000:
57
+ return "two thousand"
58
+ elif num > 2000 and num < 2010:
59
+ return "two thousand " + _inflect.number_to_words(num % 100)
60
+ elif num % 100 == 0:
61
+ return _inflect.number_to_words(num // 100) + " hundred"
62
+ else:
63
+ return _inflect.number_to_words(
64
+ num, andword="", zero="oh", group=2
65
+ ).replace(", ", " ")
66
+ else:
67
+ return _inflect.number_to_words(num, andword="")
68
+
69
+
70
+ def normalize_numbers(text):
71
+ text = re.sub(_comma_number_re, _remove_commas, text)
72
+ text = re.sub(_pounds_re, r"\1 pounds", text)
73
+ text = re.sub(_dollars_re, _expand_dollars, text)
74
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
75
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
76
+ text = re.sub(_number_re, _expand_number, text)
77
+ return text
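
A minimal usage sketch of normalize_numbers; the exact output wording assumes inflect's defaults:

    from text.numbers import normalize_numbers

    normalize_numbers("I paid $3.50 for the 2nd ticket")
    # -> "I paid three dollars, fifty cents for the second ticket"
    normalize_numbers("1,000 copies")
    # -> "one thousand copies"
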
text/pinyin.py ADDED
@@ -0,0 +1,218 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ initials = [
7
+ "b",
8
+ "c",
9
+ "ch",
10
+ "d",
11
+ "f",
12
+ "g",
13
+ "h",
14
+ "j",
15
+ "k",
16
+ "l",
17
+ "m",
18
+ "n",
19
+ "p",
20
+ "q",
21
+ "r",
22
+ "s",
23
+ "sh",
24
+ "t",
25
+ "w",
26
+ "x",
27
+ "y",
28
+ "z",
29
+ "zh",
30
+ ]
31
+ finals = [
32
+ "a1",
33
+ "a2",
34
+ "a3",
35
+ "a4",
36
+ "a5",
37
+ "ai1",
38
+ "ai2",
39
+ "ai3",
40
+ "ai4",
41
+ "ai5",
42
+ "an1",
43
+ "an2",
44
+ "an3",
45
+ "an4",
46
+ "an5",
47
+ "ang1",
48
+ "ang2",
49
+ "ang3",
50
+ "ang4",
51
+ "ang5",
52
+ "ao1",
53
+ "ao2",
54
+ "ao3",
55
+ "ao4",
56
+ "ao5",
57
+ "e1",
58
+ "e2",
59
+ "e3",
60
+ "e4",
61
+ "e5",
62
+ "ei1",
63
+ "ei2",
64
+ "ei3",
65
+ "ei4",
66
+ "ei5",
67
+ "en1",
68
+ "en2",
69
+ "en3",
70
+ "en4",
71
+ "en5",
72
+ "eng1",
73
+ "eng2",
74
+ "eng3",
75
+ "eng4",
76
+ "eng5",
77
+ "er1",
78
+ "er2",
79
+ "er3",
80
+ "er4",
81
+ "er5",
82
+ "i1",
83
+ "i2",
84
+ "i3",
85
+ "i4",
86
+ "i5",
87
+ "ia1",
88
+ "ia2",
89
+ "ia3",
90
+ "ia4",
91
+ "ia5",
92
+ "ian1",
93
+ "ian2",
94
+ "ian3",
95
+ "ian4",
96
+ "ian5",
97
+ "iang1",
98
+ "iang2",
99
+ "iang3",
100
+ "iang4",
101
+ "iang5",
102
+ "iao1",
103
+ "iao2",
104
+ "iao3",
105
+ "iao4",
106
+ "iao5",
107
+ "ie1",
108
+ "ie2",
109
+ "ie3",
110
+ "ie4",
111
+ "ie5",
112
+ "ii1",
113
+ "ii2",
114
+ "ii3",
115
+ "ii4",
116
+ "ii5",
117
+ "iii1",
118
+ "iii2",
119
+ "iii3",
120
+ "iii4",
121
+ "iii5",
122
+ "in1",
123
+ "in2",
124
+ "in3",
125
+ "in4",
126
+ "in5",
127
+ "ing1",
128
+ "ing2",
129
+ "ing3",
130
+ "ing4",
131
+ "ing5",
132
+ "iong1",
133
+ "iong2",
134
+ "iong3",
135
+ "iong4",
136
+ "iong5",
137
+ "iou1",
138
+ "iou2",
139
+ "iou3",
140
+ "iou4",
141
+ "iou5",
142
+ "o1",
143
+ "o2",
144
+ "o3",
145
+ "o4",
146
+ "o5",
147
+ "ong1",
148
+ "ong2",
149
+ "ong3",
150
+ "ong4",
151
+ "ong5",
152
+ "ou1",
153
+ "ou2",
154
+ "ou3",
155
+ "ou4",
156
+ "ou5",
157
+ "u1",
158
+ "u2",
159
+ "u3",
160
+ "u4",
161
+ "u5",
162
+ "ua1",
163
+ "ua2",
164
+ "ua3",
165
+ "ua4",
166
+ "ua5",
167
+ "uai1",
168
+ "uai2",
169
+ "uai3",
170
+ "uai4",
171
+ "uai5",
172
+ "uan1",
173
+ "uan2",
174
+ "uan3",
175
+ "uan4",
176
+ "uan5",
177
+ "uang1",
178
+ "uang2",
179
+ "uang3",
180
+ "uang4",
181
+ "uang5",
182
+ "uei1",
183
+ "uei2",
184
+ "uei3",
185
+ "uei4",
186
+ "uei5",
187
+ "uen1",
188
+ "uen2",
189
+ "uen3",
190
+ "uen4",
191
+ "uen5",
192
+ "uo1",
193
+ "uo2",
194
+ "uo3",
195
+ "uo4",
196
+ "uo5",
197
+ "v1",
198
+ "v2",
199
+ "v3",
200
+ "v4",
201
+ "v5",
202
+ "van1",
203
+ "van2",
204
+ "van3",
205
+ "van4",
206
+ "van5",
207
+ "ve1",
208
+ "ve2",
209
+ "ve3",
210
+ "ve4",
211
+ "ve5",
212
+ "vn1",
213
+ "vn2",
214
+ "vn3",
215
+ "vn4",
216
+ "vn5",
217
+ ]
218
+ valid_symbols = initials + finals + ["rr"]
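
valid_symbols is the phone inventory that the erhua lexicon above draws from; a quick consistency check (the phone list below is just one lexicon entry):

    from text.pinyin import valid_symbols

    phones = ["zh", "uang4", "rr"]    # e.g. from "zhuangr4 zh uang4 rr"
    assert all(p in valid_symbols for p in phones)
    # 23 initials + 185 tonal finals + "rr" = 209 symbols in this inventory
    assert len(valid_symbols) == 209
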
text/symbol_table.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2020 Mobvoi Inc. (authors: Fangjun Kuang)
2
+ #
3
+ # See ../../../LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from dataclasses import dataclass
18
+ from dataclasses import field
19
+ from typing import Dict
20
+ from typing import Generic
21
+ from typing import List
22
+ from typing import Optional
23
+ from typing import TypeVar
24
+ from typing import Union
25
+
26
+ Symbol = TypeVar('Symbol')
27
+
28
+ # SymbolTable is copied from
29
+ # https://github.com/k2-fsa/k2/blob/master/k2/python/k2/symbol_table.py
30
+
31
+ '''
32
+ SymbolTable: map symbol to id
33
+ '''
34
+ @dataclass(repr=False)
35
+ class SymbolTable(Generic[Symbol]):
36
+ '''SymbolTable that maps symbol IDs, found on the FSA arcs to
37
+ actual objects. These objects can be arbitrary Python objects
38
+ that can serve as keys in a dictionary (i.e. they need to be
39
+ hashable and immutable).
40
+
41
+ The SymbolTable can only be written to/read from disk if the
42
+ symbols are strings.
43
+ '''
44
+ _id2sym: Dict[int, Symbol] = field(default_factory=dict)
45
+ '''Map an integer to a symbol.
46
+ '''
47
+
48
+ _sym2id: Dict[Symbol, int] = field(default_factory=dict)
49
+ '''Map a symbol to an integer.
50
+ '''
51
+
52
+ _next_available_id: int = 1
53
+ '''A helper internal field that helps adding new symbols
54
+ to the table efficiently.
55
+ '''
56
+
57
+ eps: Symbol = '<eps>'
58
+ '''Null symbol, always mapped to index 0.
59
+ '''
60
+
61
+ def __post_init__(self):
62
+ assert all(self._sym2id[sym] == idx for idx, sym in self._id2sym.items())
63
+ assert all(self._id2sym[idx] == sym for sym, idx in self._sym2id.items())
64
+ assert 0 not in self._id2sym or self._id2sym[0] == self.eps
65
+
66
+ self._next_available_id = max(self._id2sym, default=0) + 1
67
+ self._id2sym.setdefault(0, self.eps)
68
+ self._sym2id.setdefault(self.eps, 0)
69
+
70
+
71
+ @staticmethod
72
+ def from_str(s: str) -> 'SymbolTable':
73
+ '''Build a symbol table from a string.
74
+
75
+ The string consists of lines. Every line has two fields separated
76
+ by space(s), tab(s) or both. The first field is the symbol and the
77
+ second the integer id of the symbol.
78
+
79
+ Args:
80
+ s:
81
+ The input string with the format described above.
82
+ Returns:
83
+ An instance of :class:`SymbolTable`.
84
+ '''
85
+ id2sym: Dict[int, str] = dict()
86
+ sym2id: Dict[str, int] = dict()
87
+
88
+ for line in s.split('\n'):
89
+ fields = line.split()
90
+ if len(fields) == 0:
91
+ continue # skip empty lines
92
+ assert len(fields) == 2, \
93
+ f'Expect a line with 2 fields. Given: {len(fields)}'
94
+ sym, idx = fields[0], int(fields[1])
95
+ assert sym not in sym2id, f'Duplicated symbol {sym}'
96
+ assert idx not in id2sym, f'Duplicated id {idx}'
97
+ id2sym[idx] = sym
98
+ sym2id[sym] = idx
99
+
100
+ eps = id2sym.get(0, '<eps>')
101
+
102
+ return SymbolTable(_id2sym=id2sym, _sym2id=sym2id, eps=eps)
103
+
104
+ @staticmethod
105
+ def from_file(filename: str) -> 'SymbolTable':
106
+ '''Build a symbol table from file.
107
+
108
+ Every line in the symbol table file has two fields separated by
109
+ space(s), tab(s) or both. The following is an example file:
110
+
111
+ .. code-block::
112
+
113
+ <eps> 0
114
+ a 1
115
+ b 2
116
+ c 3
117
+
118
+ Args:
119
+ filename:
120
+ Name of the symbol table file. Its format is documented above.
121
+
122
+ Returns:
123
+ An instance of :class:`SymbolTable`.
124
+
125
+ '''
126
+ with open(filename, 'r', encoding='utf-8') as f:
127
+ return SymbolTable.from_str(f.read().strip())
128
+
129
+ def to_str(self) -> str:
130
+ '''
131
+ Returns:
132
+ Return a string representation of this object. You can pass
133
+ it to the method ``from_str`` to recreate an identical object.
134
+ '''
135
+ s = ''
136
+ for idx, symbol in sorted(self._id2sym.items()):
137
+ s += f'{symbol} {idx}\n'
138
+ return s
139
+
140
+ def to_file(self, filename: str):
141
+ '''Serialize the SymbolTable to a file.
142
+
143
+ Every line in the symbol table file has two fields separated by
144
+ space(s), tab(s) or both. The following is an example file:
145
+
146
+ .. code-block::
147
+
148
+ <eps> 0
149
+ a 1
150
+ b 2
151
+ c 3
152
+
153
+ Args:
154
+ filename:
155
+ Name of the symbol table file. Its format is documented above.
156
+ '''
157
+ with open(filename, 'w') as f:
158
+ for idx, symbol in sorted(self._id2sym.items()):
159
+ print(symbol, idx, file=f)
160
+
161
+ def add(self, symbol: Symbol, index: Optional[int] = None) -> int:
162
+ '''Add a new symbol to the SymbolTable.
163
+
164
+ Args:
165
+ symbol:
166
+ The symbol to be added.
167
+ index:
168
+ Optional int id to which the symbol should be assigned.
169
+ If it is not available, a ValueError will be raised.
170
+
171
+ Returns:
172
+ The int id to which the symbol has been assigned.
173
+ '''
174
+ # Already in the table? Return its ID.
175
+ if symbol in self._sym2id:
176
+ return self._sym2id[symbol]
177
+ # Specific ID not provided - use next available.
178
+ if index is None:
179
+ index = self._next_available_id
180
+ # Specific ID provided but not available.
181
+ if index in self._id2sym:
182
+ raise ValueError(f"Cannot assign id '{index}' to '{symbol}' - "
183
+ f"already occupied by {self._id2sym[index]}")
184
+ self._sym2id[symbol] = index
185
+ self._id2sym[index] = symbol
186
+
187
+ # Update next available ID if needed
188
+ if self._next_available_id <= index:
189
+ self._next_available_id = index + 1
190
+
191
+ return index
192
+
193
+ def get(self, k: Union[int, Symbol]) -> Union[Symbol, int]:
194
+ '''Get a symbol for an id or get an id for a symbol
195
+
196
+ Args:
197
+ k:
198
+ If it is an id, it tries to find the symbol corresponding
199
+ to the id; if it is a symbol, it tries to find the id
200
+ corresponding to the symbol.
201
+
202
+ Returns:
203
+ An id or a symbol depending on the given `k`.
204
+ '''
205
+ if isinstance(k, int):
206
+ return self._id2sym[k]
207
+ else:
208
+ return self._sym2id[k]
209
+
210
+ def merge(self, other: 'SymbolTable') -> 'SymbolTable':
211
+ '''Create a union of two SymbolTables.
212
+ Raises an AssertionError if the same IDs are occupied by
213
+ different symbols.
214
+
215
+ Args:
216
+ other:
217
+ A symbol table to merge with ``self``.
218
+
219
+ Returns:
220
+ A new symbol table.
221
+ '''
222
+ self._check_compatible(other)
223
+ return SymbolTable(
224
+ _id2sym={**self._id2sym, **other._id2sym},
225
+ _sym2id={**self._sym2id, **other._sym2id},
226
+ eps=self.eps
227
+ )
228
+
229
+ def _check_compatible(self, other: 'SymbolTable') -> None:
230
+ # Epsilon compatibility
231
+ assert self.eps == other.eps, f'Mismatched epsilon symbol: ' \
232
+ f'{self.eps} != {other.eps}'
233
+ # IDs compatibility
234
+ common_ids = set(self._id2sym).intersection(other._id2sym)
235
+ for idx in common_ids:
236
+ assert self[idx] == other[idx], f'ID conflict for id: {idx}, ' \
237
+ f'self[idx] = "{self[idx]}", ' \
238
+ f'other[idx] = "{other[idx]}"'
239
+ # Symbols compatibility
240
+ common_symbols = set(self._sym2id).intersection(other._sym2id)
241
+ for sym in common_symbols:
242
+ assert self[sym] == other[sym], f'ID conflict for id: {sym}, ' \
243
+ f'self[sym] = "{self[sym]}", ' \
244
+ f'other[sym] = "{other[sym]}"'
245
+
246
+ def __getitem__(self, item: Union[int, Symbol]) -> Union[Symbol, int]:
247
+ return self.get(item)
248
+
249
+ def __contains__(self, item: Union[int, Symbol]) -> bool:
250
+ if isinstance(item, int):
251
+ return item in self._id2sym
252
+ else:
253
+ return item in self._sym2id
254
+
255
+ def __len__(self) -> int:
256
+ return len(self._id2sym)
257
+
258
+ def __eq__(self, other: 'SymbolTable') -> bool:
259
+ if len(self) != len(other):
260
+ return False
261
+
262
+ for s in self.symbols:
263
+ if self[s] != other[s]:
264
+ return False
265
+
266
+ return True
267
+
268
+ @property
269
+ def ids(self) -> List[int]:
270
+ '''Returns a list of integer IDs corresponding to the symbols.
271
+ '''
272
+ ans = list(self._id2sym.keys())
273
+ ans.sort()
274
+ return ans
275
+
276
+ @property
277
+ def symbols(self) -> List[Symbol]:
278
+ '''Returns a list of symbols (e.g., strings) corresponding to
279
+ the integer IDs.
280
+ '''
281
+ ans = list(self._sym2id.keys())
282
+ ans.sort()
283
+ return ans
284
+
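
A minimal usage sketch of SymbolTable; the mapping below is illustrative:

    from text.symbol_table import SymbolTable

    table = SymbolTable.from_str("<eps> 0\na 1\nb 2")
    table.add("c")     # assigns the next free id -> 3
    table["a"]         # -> 1
    table[2]           # -> "b"
    table.to_str()     # "<eps> 0\na 1\nb 2\nc 3\n"
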
text/symbols.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ """
9
+ Defines the set of symbols used in text input to the model.
10
+
11
+ The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. """
12
+
13
+ from text import cmudict, pinyin
14
+
15
+ _pad = "_"
16
+ _punctuation = "!'(),.:;? "
17
+ _special = "-"
18
+ _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
19
+ _silences = ["@sp", "@spn", "@sil"]
20
+
21
+ # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
22
+ _arpabet = ["@" + s for s in cmudict.valid_symbols]
23
+ _pinyin = ["@" + s for s in pinyin.valid_symbols]
24
+
25
+ # Export all symbols:
26
+ symbols = (
27
+ [_pad]
28
+ + list(_special)
29
+ + list(_punctuation)
30
+ + list(_letters)
31
+ + _arpabet
32
+ + _silences
33
+ # + _pinyin # for chinese
34
+ )
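
Consumers of this module typically build a symbol-to-id lookup from the exported list; a minimal sketch (_symbol_to_id is an illustrative name, not defined in this file):

    from text.symbols import symbols

    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
    _symbol_to_id["_"]    # 0: the padding symbol comes first
    # ARPAbet phones are prefixed with "@", e.g. _symbol_to_id["@AA1"]
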
text/text_token_collation.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright (c) 2023 Amphion.
2
+
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from pathlib import Path
7
+ from typing import List, Tuple
8
+ import os
9
+ import numpy as np
10
+ import torch
11
+ from text.symbol_table import SymbolTable
12
+ from text import text_to_sequence
13
+
14
+
15
+ '''
16
+ TextToken: map text to id
17
+ '''
18
+ # TextTokenCollator is modified from
19
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/collation.py
20
+ class TextTokenCollator:
21
+ def __init__(
22
+ self,
23
+ text_tokens: List[str],
24
+ add_eos: bool = True,
25
+ add_bos: bool = True,
26
+ pad_symbol: str = "<pad>",
27
+ bos_symbol: str = "<bos>",
28
+ eos_symbol: str = "<eos>",
29
+ ):
30
+ self.pad_symbol = pad_symbol
31
+ self.add_eos = add_eos
32
+ self.add_bos = add_bos
33
+ self.bos_symbol = bos_symbol
34
+ self.eos_symbol = eos_symbol
35
+
36
+ unique_tokens = [pad_symbol]
37
+ if add_bos:
38
+ unique_tokens.append(bos_symbol)
39
+ if add_eos:
40
+ unique_tokens.append(eos_symbol)
41
+ unique_tokens.extend(sorted(text_tokens))
42
+
43
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
44
+ self.idx2token = unique_tokens
45
+
46
+ def index(
47
+ self, tokens_list: List[str]
48
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
49
+ seqs, seq_lens = [], []
50
+ for tokens in tokens_list:
51
+ assert all(
52
+ s in self.token2idx
53
+ for s in tokens
54
+ )
55
+ seq = (
56
+ ([self.bos_symbol] if self.add_bos else [])
57
+ + list(tokens)
58
+ + ([self.eos_symbol] if self.add_eos else [])
59
+ )
60
+ seqs.append(seq)
61
+ seq_lens.append(len(seq))
62
+
63
+ max_len = max(seq_lens)
64
+ for k, (seq, seq_len) in enumerate(zip(seqs, seq_lens)):
65
+ seq.extend([self.pad_symbol] * (max_len - seq_len))
66
+
67
+ tokens = torch.from_numpy(
68
+ np.array(
69
+ [[self.token2idx[token] for token in seq] for seq in seqs],
70
+ dtype=np.int64,
71
+ )
72
+ )
73
+ tokens_lens = torch.IntTensor(seq_lens)
74
+
75
+ return tokens, tokens_lens
76
+
77
+ def __call__(self, text):
78
+ tokens_seq = [p for p in text]
79
+ seq = (
80
+ ([self.bos_symbol] if self.add_bos else [])
81
+ + tokens_seq
82
+ + ([self.eos_symbol] if self.add_eos else [])
83
+ )
84
+
85
+ token_ids = [self.token2idx[token] for token in seq]
86
+ token_lens = len(tokens_seq) + self.add_eos + self.add_bos
87
+
88
+ return token_ids, token_lens
89
+
90
+
91
+ def get_text_token_collater(text_tokens_file: str) -> TextTokenCollator:
92
+ text_tokens_path = Path(text_tokens_file)
93
+ unique_tokens = SymbolTable.from_file(text_tokens_path)
94
+ collater = TextTokenCollator(
95
+ unique_tokens.symbols, add_bos=True, add_eos=True
96
+ )
97
+ token2idx = collater.token2idx
98
+ return collater, token2idx
99
+
100
+
101
+ class phoneIDCollation:
102
+ def __init__(self, cfg, dataset=None, symbols_dict_file=None) -> None:
103
+
104
+ if cfg.preprocess.phone_extractor != 'lexicon':
105
+ ### get text token collator
106
+ if symbols_dict_file is None:
107
+ assert dataset is not None
108
+ symbols_dict_file = os.path.join(
109
+ cfg.preprocess.processed_dir,
110
+ dataset,
111
+ cfg.preprocess.symbols_dict
112
+ )
113
+ self.text_token_colloator, token2idx = get_text_token_collater(symbols_dict_file)
114
+ # # unique_tokens = SymbolTable.from_file(symbols_dict_path)
115
+ # # text_tokenizer = TextToken(unique_tokens.symbols, add_bos=True, add_eos=True)
116
+
117
+ # # update phone symbols dict file with pad_symbol or optional tokens (add_bos and add_eos) in TextTokenCollator
118
+ # phone_symbol_dict = SymbolTable()
119
+ # for s in sorted(list(set(token2idx.keys()))):
120
+ # phone_symbol_dict.add(s)
121
+ # phone_symbol_dict.to_file(symbols_dict_file)
122
+
123
+ def get_phone_id_sequence(self, cfg, phones_seq):
124
+
125
+ if cfg.preprocess.phone_extractor == 'lexicon':
126
+ phones_seq = ' '.join(phones_seq)
127
+ sequence = text_to_sequence(phones_seq, cfg.preprocess.text_cleaners)
128
+ else:
129
+ sequence, seq_len = self.text_token_colloator(phones_seq)
130
+ return sequence
131
+
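
A minimal usage sketch of TextTokenCollator with a tiny token set (the token list is illustrative):

    from text.text_token_collation import TextTokenCollator

    collator = TextTokenCollator(["a", "b", "c"], add_bos=True, add_eos=True)
    # token2idx: {"<pad>": 0, "<bos>": 1, "<eos>": 2, "a": 3, "b": 4, "c": 5}
    token_ids, token_len = collator(["a", "c"])
    # token_ids == [1, 3, 5, 2]  (<bos>, a, c, <eos>); token_len == 4
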
utils/HyperParams/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .hps import HyperParams
utils/HyperParams/hps.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ class HyperParams:
8
+ """The class to store hyperparameters. The key is case-insensitive.
9
+
10
+ Args:
11
+ *args: a list of dict or HyperParams.
12
+ **kwargs: a list of key-value pairs.
13
+ """
14
+
15
+ def __init__(self, **kwargs):
16
+ for k, v in kwargs.items():
17
+ if type(v) == dict:
18
+ v = HyperParams(**v)
19
+ self[k] = v
20
+
21
+ def keys(self):
22
+ return self.__dict__.keys()
23
+
24
+ def items(self):
25
+ return self.__dict__.items()
26
+
27
+ def values(self):
28
+ return self.__dict__.values()
29
+
30
+ def __len__(self):
31
+ return len(self.__dict__)
32
+
33
+ def __getitem__(self, key):
34
+ return getattr(self, key)
35
+
36
+ def __setitem__(self, key, value):
37
+ return setattr(self, key, value)
38
+
39
+ def __contains__(self, key):
40
+ return key in self.__dict__
41
+
42
+ def __repr__(self):
43
+ return self.__dict__.__repr__()
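
A minimal usage sketch of HyperParams; the keys are illustrative:

    from utils.HyperParams import HyperParams

    hps = HyperParams(model={"hidden_size": 256}, lr=1e-4)
    hps.model.hidden_size   # 256 (nested dicts become HyperParams)
    hps["lr"]               # 0.0001
    "model" in hps          # True
    len(hps)                # 2
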
utils/__init__.py ADDED
File without changes
utils/audio.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import numpy as np
8
+ from numpy import linalg as LA
9
+ import librosa
10
+ import soundfile as sf
11
+ import librosa.filters
12
+
13
+
14
+ def load_audio_torch(wave_file, fs):
15
+ """Load audio data into torch tensor
16
+
17
+ Args:
18
+ wave_file (str): path to wave file
19
+ fs (int): sample rate
20
+
21
+ Returns:
22
+ audio (tensor): audio data in tensor
23
+ fs (int): sample rate
24
+ """
25
+
26
+ audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
27
+ # audio: (T,)
28
+ assert len(audio) > 2
29
+
30
+ # Check the audio type (for the soundfile loading backend) - float, 8bit or 16bit
31
+ if np.issubdtype(audio.dtype, np.integer):
32
+ max_mag = -np.iinfo(audio.dtype).min
33
+ else:
34
+ max_mag = max(np.amax(audio), -np.amin(audio))
35
+ max_mag = (
36
+ (2**31) + 1
37
+ if max_mag > (2**15)
38
+ else ((2**15) + 1 if max_mag > 1.01 else 1.0)
39
+ )
40
+
41
+ # Normalize the audio
42
+ audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag
43
+
44
+ if (torch.isnan(audio) | torch.isinf(audio)).any():
45
+ return [], sample_rate or fs or 48000
46
+
47
+ # Resample the audio to our target samplerate
48
+ if fs is not None and fs != sample_rate:
49
+ audio = torch.from_numpy(
50
+ librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
51
+ )
52
+ sample_rate = fs
53
+
54
+ return audio, fs
55
+
56
+
57
+ def _stft(y, cfg):
58
+ return librosa.stft(
59
+ y=y, n_fft=cfg.n_fft, hop_length=cfg.hop_size, win_length=cfg.win_size
60
+ )
61
+
62
+
63
+ def energy(wav, cfg):
64
+ D = _stft(wav, cfg)
65
+ magnitudes = np.abs(D).T  # (frames, freq_bins)
66
+ return LA.norm(magnitudes, axis=1)
67
+
68
+
69
+ def get_energy_from_tacotron(audio, _stft):
70
+ audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
71
+ audio = torch.autograd.Variable(audio, requires_grad=False)
72
+ mel, energy = _stft.mel_spectrogram(audio)
73
+ energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
74
+ return mel, energy
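
A minimal sketch of computing frame-level energy with these helpers; the SimpleNamespace config and the wav path stand in for the project's real preprocess config and data:

    from types import SimpleNamespace
    from utils.audio import load_audio_torch, energy

    cfg = SimpleNamespace(n_fft=1024, hop_size=256, win_size=1024)
    wav, fs = load_audio_torch("example.wav", 24000)   # placeholder path
    frame_energy = energy(wav.numpy(), cfg)            # one L2 norm per STFT frame
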
utils/audio_slicer.py ADDED
@@ -0,0 +1,476 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ import torch
11
+ import torchaudio
12
+
13
+ from utils.io import save_audio
14
+ from utils.audio import load_audio_torch
15
+
16
+
17
+ # This function is obtained from librosa.
18
+ def get_rms(
19
+ y,
20
+ *,
21
+ frame_length=2048,
22
+ hop_length=512,
23
+ pad_mode="constant",
24
+ ):
25
+ padding = (int(frame_length // 2), int(frame_length // 2))
26
+ y = np.pad(y, padding, mode=pad_mode)
27
+
28
+ axis = -1
29
+ # put our new within-frame axis at the end for now
30
+ out_strides = y.strides + tuple([y.strides[axis]])
31
+ # Reduce the shape on the framing axis
32
+ x_shape_trimmed = list(y.shape)
33
+ x_shape_trimmed[axis] -= frame_length - 1
34
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
35
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
36
+ if axis < 0:
37
+ target_axis = axis - 1
38
+ else:
39
+ target_axis = axis + 1
40
+ xw = np.moveaxis(xw, -1, target_axis)
41
+ # Downsample along the target axis
42
+ slices = [slice(None)] * xw.ndim
43
+ slices[axis] = slice(0, None, hop_length)
44
+ x = xw[tuple(slices)]
45
+
46
+ # Calculate power
47
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
48
+
49
+ return np.sqrt(power)
50
+
51
+
52
+ class Slicer:
53
+ """
54
+ Copy from: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ sr: int,
60
+ threshold: float = -40.0,
61
+ min_length: int = 5000,
62
+ min_interval: int = 300,
63
+ hop_size: int = 10,
64
+ max_sil_kept: int = 5000,
65
+ ):
66
+ if not min_length >= min_interval >= hop_size:
67
+ raise ValueError(
68
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
69
+ )
70
+ if not max_sil_kept >= hop_size:
71
+ raise ValueError(
72
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
73
+ )
74
+ min_interval = sr * min_interval / 1000
75
+ self.threshold = 10 ** (threshold / 20.0)
76
+ self.hop_size = round(sr * hop_size / 1000)
77
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
78
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
79
+ self.min_interval = round(min_interval / self.hop_size)
80
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
81
+
82
+ def _apply_slice(self, waveform, begin, end):
83
+ begin = begin * self.hop_size
84
+ if len(waveform.shape) > 1:
85
+ end = min(waveform.shape[1], end * self.hop_size)
86
+ return waveform[:, begin:end], begin, end
87
+ else:
88
+ end = min(waveform.shape[0], end * self.hop_size)
89
+ return waveform[begin:end], begin, end
90
+
91
+ # @timeit
92
+ def slice(self, waveform, return_chunks_positions=False):
93
+ if len(waveform.shape) > 1:
94
+ # (#channel, wave_len) -> (wave_len)
95
+ samples = waveform.mean(axis=0)
96
+ else:
97
+ samples = waveform
98
+ if samples.shape[0] <= self.min_length:
99
+ return [waveform]
100
+ rms_list = get_rms(
101
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
102
+ ).squeeze(0)
103
+ sil_tags = []
104
+ silence_start = None
105
+ clip_start = 0
106
+ for i, rms in enumerate(rms_list):
107
+ # Keep looping while frame is silent.
108
+ if rms < self.threshold:
109
+ # Record start of silent frames.
110
+ if silence_start is None:
111
+ silence_start = i
112
+ continue
113
+ # Keep looping while frame is not silent and silence start has not been recorded.
114
+ if silence_start is None:
115
+ continue
116
+ # Clear recorded silence start if interval is not enough or clip is too short
117
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
118
+ need_slice_middle = (
119
+ i - silence_start >= self.min_interval
120
+ and i - clip_start >= self.min_length
121
+ )
122
+ if not is_leading_silence and not need_slice_middle:
123
+ silence_start = None
124
+ continue
125
+ # Need slicing. Record the range of silent frames to be removed.
126
+ if i - silence_start <= self.max_sil_kept:
127
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
128
+ if silence_start == 0:
129
+ sil_tags.append((0, pos))
130
+ else:
131
+ sil_tags.append((pos, pos))
132
+ clip_start = pos
133
+ elif i - silence_start <= self.max_sil_kept * 2:
134
+ pos = rms_list[
135
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
136
+ ].argmin()
137
+ pos += i - self.max_sil_kept
138
+ pos_l = (
139
+ rms_list[
140
+ silence_start : silence_start + self.max_sil_kept + 1
141
+ ].argmin()
142
+ + silence_start
143
+ )
144
+ pos_r = (
145
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
146
+ + i
147
+ - self.max_sil_kept
148
+ )
149
+ if silence_start == 0:
150
+ sil_tags.append((0, pos_r))
151
+ clip_start = pos_r
152
+ else:
153
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
154
+ clip_start = max(pos_r, pos)
155
+ else:
156
+ pos_l = (
157
+ rms_list[
158
+ silence_start : silence_start + self.max_sil_kept + 1
159
+ ].argmin()
160
+ + silence_start
161
+ )
162
+ pos_r = (
163
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
164
+ + i
165
+ - self.max_sil_kept
166
+ )
167
+ if silence_start == 0:
168
+ sil_tags.append((0, pos_r))
169
+ else:
170
+ sil_tags.append((pos_l, pos_r))
171
+ clip_start = pos_r
172
+ silence_start = None
173
+ # Deal with trailing silence.
174
+ total_frames = rms_list.shape[0]
175
+ if (
176
+ silence_start is not None
177
+ and total_frames - silence_start >= self.min_interval
178
+ ):
179
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
180
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
181
+ sil_tags.append((pos, total_frames + 1))
182
+ # Apply and return slices.
183
+ if len(sil_tags) == 0:
184
+ return [waveform]
185
+ else:
186
+ chunks = []
187
+ chunks_pos_of_waveform = []
188
+
189
+ if sil_tags[0][0] > 0:
190
+ chunk, begin, end = self._apply_slice(waveform, 0, sil_tags[0][0])
191
+ chunks.append(chunk)
192
+ chunks_pos_of_waveform.append((begin, end))
193
+
194
+ for i in range(len(sil_tags) - 1):
195
+ chunk, begin, end = self._apply_slice(
196
+ waveform, sil_tags[i][1], sil_tags[i + 1][0]
197
+ )
198
+ chunks.append(chunk)
199
+ chunks_pos_of_waveform.append((begin, end))
200
+
201
+ if sil_tags[-1][1] < total_frames:
202
+ chunk, begin, end = self._apply_slice(
203
+ waveform, sil_tags[-1][1], total_frames
204
+ )
205
+ chunks.append(chunk)
206
+ chunks_pos_of_waveform.append((begin, end))
207
+
208
+ return (
209
+ chunks
210
+ if not return_chunks_positions
211
+ else (
212
+ chunks,
213
+ chunks_pos_of_waveform,
214
+ )
215
+ )
216
+
217
+
218
+ def split_utterances_from_audio(
219
+ wav_file,
220
+ output_dir,
221
+ max_duration_of_utterance=10.0,
222
+ min_interval=300,
223
+ db_threshold=-40,
224
+ ):
225
+ """
226
+ Split a long audio file into utterances according to silence (VAD).
227
+
228
+ max_duration_of_utterance (second):
229
+ The maximum duration of every utterance (seconds)
230
+ min_interval (millisecond):
231
+ The smaller min_interval is, the more sliced audio clips this script is likely to generate.
232
+ """
233
+ print("File:", wav_file.split("/")[-1])
234
+ waveform, fs = torchaudio.load(wav_file)
235
+
236
+ slicer = Slicer(sr=fs, min_interval=min_interval, threshold=db_threshold)
237
+ chunks, positions = slicer.slice(waveform, return_chunks_positions=True)
238
+
239
+ durations = [(end - begin) / fs for begin, end in positions]
240
+ print(
241
+ "Slicer's min silence part is {}ms, min and max duration of sliced utterances is {}s and {}s".format(
242
+ min_interval, min(durations), max(durations)
243
+ )
244
+ )
245
+
246
+ res_chunks, res_positions = [], []
247
+ for i, chunk in enumerate(chunks):
248
+ if len(chunk.shape) == 1:
249
+ chunk = chunk[None, :]
250
+
251
+ begin, end = positions[i]
252
+ assert end - begin == chunk.shape[-1]
253
+
254
+ max_wav_len = max_duration_of_utterance * fs
255
+ if chunk.shape[-1] <= max_wav_len:
256
+ res_chunks.append(chunk)
257
+ res_positions.append(positions[i])
258
+ else:
259
+ # TODO: to reserve overlapping and conduct fade-in, fade-out
260
+
261
+ # Get segments number
262
+ number = 2
263
+ while chunk.shape[-1] // number >= max_wav_len:
264
+ number += 1
265
+ seg_len = chunk.shape[-1] // number
266
+
267
+ # Split
268
+ for num in range(number):
269
+ s = seg_len * num
270
+ t = min(s + seg_len, chunk.shape[-1])
271
+
272
+ seg_begin = begin + s
273
+ seg_end = begin + t
274
+
275
+ res_chunks.append(chunk[:, s:t])
276
+ res_positions.append((seg_begin, seg_end))
277
+
278
+ # Save utterances
279
+ os.makedirs(output_dir, exist_ok=True)
280
+ res = {"fs": int(fs)}
281
+ for i, chunk in enumerate(res_chunks):
282
+ filename = "{:04d}.wav".format(i)
283
+ res[filename] = [int(p) for p in res_positions[i]]
284
+ save_audio(os.path.join(output_dir, filename), chunk, fs)
285
+
286
+ # Save positions
287
+ with open(os.path.join(output_dir, "positions.json"), "w") as f:
288
+ json.dump(res, f, indent=4, ensure_ascii=False)
289
+ return res
290
+
291
+
292
+ def is_silence(
293
+ wavform,
294
+ fs,
295
+ threshold=-40.0,
296
+ min_interval=300,
297
+ hop_size=10,
298
+ min_length=5000,
299
+ ):
300
+ """
301
+ Detect whether the given wavform is silence
302
+
303
+ wavform: (T, )
304
+ """
305
+ threshold = 10 ** (threshold / 20.0)
306
+
307
+ hop_size = round(fs * hop_size / 1000)
308
+ win_size = min(round(min_interval), 4 * hop_size)
309
+ min_length = round(fs * min_length / 1000 / hop_size)
310
+
311
+ if wavform.shape[0] <= min_length:
312
+ return True
313
+
314
+ # (#Frame,)
315
+ rms_array = get_rms(y=wavform, frame_length=win_size, hop_length=hop_size).squeeze(
316
+ 0
317
+ )
318
+ return (rms_array < threshold).all()
319
+
320
+
321
+ def split_audio(
322
+ wav_file, target_sr, output_dir, max_duration_of_segment=10.0, overlap_duration=1.0
323
+ ):
324
+ """
325
+ Split a long audio into segments.
326
+
327
+ target_sr:
328
+ The target sampling rate to save the segments.
329
+ max_duration_of_segment (second):
330
+ The maximum duration of every segment (seconds)
331
+ overlap_duration:
332
+ Each segment has "overlap duration" (second) overlap with its previous and next segment
333
+ """
334
+ # (#channel, T) -> (T,)
335
+ waveform, fs = torchaudio.load(wav_file)
336
+ waveform = torchaudio.functional.resample(
337
+ waveform, orig_freq=fs, new_freq=target_sr
338
+ )
339
+ waveform = torch.mean(waveform, dim=0)
340
+
341
+ # waveform, _ = load_audio_torch(wav_file, target_sr)
342
+ assert len(waveform.shape) == 1
343
+
344
+ assert overlap_duration < max_duration_of_segment
345
+ length = int(max_duration_of_segment * target_sr)
346
+ stride = int((max_duration_of_segment - overlap_duration) * target_sr)
347
+ chunks = []
348
+ for i in range(0, len(waveform), stride):
349
+ # (length,)
350
+ chunks.append(waveform[i : i + length])
351
+ if i + length >= len(waveform):
352
+ break
353
+
354
+ # Save segments
355
+ os.makedirs(output_dir, exist_ok=True)
356
+ results = []
357
+ for i, chunk in enumerate(chunks):
358
+ uid = "{:04d}".format(i)
359
+ filename = os.path.join(output_dir, "{}.wav".format(uid))
360
+ results.append(
361
+ {"Uid": uid, "Path": filename, "Duration": len(chunk) / target_sr}
362
+ )
363
+ save_audio(
364
+ filename,
365
+ chunk,
366
+ target_sr,
367
+ turn_up=not is_silence(chunk, target_sr),
368
+ add_silence=False,
369
+ )
370
+
371
+ return results
372
+
373
+
374
+ def merge_segments_torchaudio(wav_files, fs, output_path, overlap_duration=1.0):
375
+ """Merge the given wav_files (may have overlaps) into a long audio
376
+
377
+ fs:
378
+ The sampling rate of the wav files.
379
+ output_path:
380
+ The output path to save the merged audio.
381
+ overlap_duration (float, optional):
382
+ Each segment has "overlap duration" (second) overlap with its previous and next segment. Defaults to 1.0.
383
+ """
384
+
385
+ waveforms = []
386
+ for file in wav_files:
387
+ # (T,)
388
+ waveform, _ = load_audio_torch(file, fs)
389
+ waveforms.append(waveform)
390
+
391
+ if len(waveforms) == 1:
392
+ save_audio(output_path, waveforms[0], fs, add_silence=False, turn_up=False)
393
+ return
394
+
395
+ overlap_len = int(overlap_duration * fs)
396
+ fade_out = torchaudio.transforms.Fade(fade_out_len=overlap_len)
397
+ fade_in = torchaudio.transforms.Fade(fade_in_len=overlap_len)
398
+ fade_in_and_out = torchaudio.transforms.Fade(fade_in_len=overlap_len, fade_out_len=overlap_len)
399
+
400
+ segments_lens = [len(wav) for wav in waveforms]
401
+ merged_waveform_len = sum(segments_lens) - overlap_len * (len(waveforms) - 1)
402
+ merged_waveform = torch.zeros(merged_waveform_len)
403
+
404
+ start = 0
405
+ for index, wav in enumerate(
406
+ tqdm(waveforms, desc="Merge for {}".format(output_path))
407
+ ):
408
+ wav_len = len(wav)
409
+
410
+ if index == 0:
411
+ wav = fade_out(wav)
412
+ elif index == len(waveforms) - 1:
413
+ wav = fade_in(wav)
414
+ else:
415
+ wav = fade_in_and_out(wav)
416
+
417
+ merged_waveform[start : start + wav_len] = wav
418
+ start += wav_len - overlap_len
419
+
420
+ save_audio(output_path, merged_waveform, fs, add_silence=False, turn_up=True)
421
+
422
+
423
+ def merge_segments_encodec(wav_files, fs, output_path, overlap_duration=1.0):
424
+ """Merge the given wav_files (may have overlaps) into a long audio
425
+
426
+ fs:
427
+ The sampling rate of the wav files.
428
+ output_path:
429
+ The output path to save the merged audio.
430
+ overlap_duration (float, optional):
431
+ Each segment has "overlap duration" (second) overlap with its previous and next segment. Defaults to 1.0.
432
+ """
433
+
434
+ waveforms = []
435
+ for file in wav_files:
436
+ # (T,)
437
+ waveform, _ = load_audio_torch(file, fs)
438
+ waveforms.append(waveform)
439
+
440
+ if len(waveforms) == 1:
441
+ save_audio(output_path, waveforms[0], fs, add_silence=False, turn_up=False)
442
+ return
443
+
444
+ device = waveforms[0].device
445
+ dtype = waveforms[0].dtype
446
+ shape = waveforms[0].shape[:-1]
447
+
448
+ overlap_len = int(overlap_duration * fs)
449
+ segments_lens = [len(wav) for wav in waveforms]
450
+ merged_waveform_len = sum(segments_lens) - overlap_len * (len(waveforms) - 1)
451
+
452
+ sum_weight = torch.zeros(merged_waveform_len, device=device, dtype=dtype)
453
+ out = torch.zeros(*shape, merged_waveform_len, device=device, dtype=dtype)
454
+ offset = 0
455
+
456
+ for frame in waveforms:
457
+ frame_length = frame.size(-1)
458
+ t = torch.linspace(0, 1, frame_length + 2, device=device, dtype=torch.float32)[
459
+ 1:-1
460
+ ]
461
+ weight = 0.5 - (t - 0.5).abs()
462
+ weighted_frame = frame * weight
463
+
464
+ cur = out[..., offset : offset + frame_length]
465
+ cur += weighted_frame[..., : cur.size(-1)]
466
+ out[..., offset : offset + frame_length] = cur
467
+
468
+ cur = sum_weight[offset : offset + frame_length]
469
+ cur += weight[..., : cur.size(-1)]
470
+ sum_weight[offset : offset + frame_length] = cur
471
+
472
+ offset += frame_length - overlap_len
473
+
474
+ assert sum_weight.min() > 0
475
+ merged_waveform = out / sum_weight
476
+ save_audio(output_path, merged_waveform, fs, add_silence=False, turn_up=True)
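
A minimal usage sketch of the VAD-based splitter defined above; both paths are placeholders:

    from utils.audio_slicer import split_utterances_from_audio

    res = split_utterances_from_audio(
        "long_recording.wav",            # placeholder input path
        "data/long_recording_utts",      # placeholder output dir
        max_duration_of_utterance=10.0,
        min_interval=300,
        db_threshold=-40,
    )
    # writes 0000.wav, 0001.wav, ... plus positions.json into the output dir;
    # res maps each saved filename to its (begin, end) sample positions
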
utils/data_utils.py ADDED
@@ -0,0 +1,575 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import numpy as np
10
+ from scipy.interpolate import interp1d
11
+ from tqdm import tqdm
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+
15
+ def load_content_feature_path(meta_data, processed_dir, feat_dir):
16
+ utt2feat_path = {}
17
+ for utt_info in meta_data:
18
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
19
+ feat_path = os.path.join(
20
+ processed_dir, utt_info["Dataset"], feat_dir, f'{utt_info["Uid"]}.npy'
21
+ )
22
+ utt2feat_path[utt] = feat_path
23
+
24
+ return utt2feat_path
25
+
26
+
27
+ def load_source_content_feature_path(meta_data, feat_dir):
28
+ utt2feat_path = {}
29
+ for utt in meta_data:
30
+ feat_path = os.path.join(feat_dir, f"{utt}.npy")
31
+ utt2feat_path[utt] = feat_path
32
+
33
+ return utt2feat_path
34
+
35
+
36
+ def get_spk_map(spk2id_path, utt2spk_path):
37
+ utt2spk = {}
38
+ with open(spk2id_path, "r") as spk2id_file:
39
+ spk2id = json.load(spk2id_file)
40
+ with open(utt2spk_path, encoding="utf-8") as f:
41
+ for line in f.readlines():
42
+ utt, spk = line.strip().split("\t")
43
+ utt2spk[utt] = spk
44
+ return spk2id, utt2spk
45
+
46
+
47
+ def get_target_f0_median(f0_dir):
48
+ total_f0 = []
49
+ for utt in os.listdir(f0_dir):
50
+ if not utt.endswith(".npy"):
51
+ continue
52
+ f0_feat_path = os.path.join(f0_dir, utt)
53
+ f0 = np.load(f0_feat_path)
54
+ total_f0 += f0.tolist()
55
+
56
+ total_f0 = np.array(total_f0)
57
+ voiced_position = np.where(total_f0 != 0)
58
+ return np.median(total_f0[voiced_position])
59
+
60
+
61
+ def get_conversion_f0_factor(source_f0, target_median, source_median=None):
62
+ """Align the median between source f0 and target f0
63
+
64
+ Note: Here we use multiplication, whose factor is target_median/source_median
65
+
66
+ Reference: Frequency and pitch interval
67
+ http://blog.ccyg.studio/article/be12c2ee-d47c-4098-9782-ca76da3035e4/
68
+ """
69
+ if source_median is None:
70
+ voiced_position = np.where(source_f0 != 0)
71
+ source_median = np.median(source_f0[voiced_position])
72
+ factor = target_median / source_median
73
+ return source_median, factor
74
+
75
+
76
+ def transpose_key(frame_pitch, trans_key):
77
+ # Transpose by user's argument
78
+ print("Transpose key = {} ...\n".format(trans_key))
79
+
80
+ transed_pitch = frame_pitch * 2 ** (trans_key / 12)
81
+ return transed_pitch
82
+
83
+
84
+ def pitch_shift_to_target(frame_pitch, target_pitch_median, source_pitch_median=None):
85
+ # Loading F0 Base (median) and shift
86
+ source_pitch_median, factor = get_conversion_f0_factor(
87
+ frame_pitch, target_pitch_median, source_pitch_median
88
+ )
89
+ print(
90
+ "Auto transposing: source f0 median = {:.1f}, target f0 median = {:.1f}, factor = {:.2f}".format(
91
+ source_pitch_median, target_pitch_median, factor
92
+ )
93
+ )
94
+ transed_pitch = frame_pitch * factor
95
+ return transed_pitch
96
+
97
+
98
+ def load_frame_pitch(
99
+ meta_data,
100
+ processed_dir,
101
+ pitch_dir,
102
+ use_log_scale=False,
103
+ return_norm=False,
104
+ interoperate=False,
105
+ utt2spk=None,
106
+ ):
107
+ utt2pitch = {}
108
+ utt2uv = {}
109
+ if utt2spk is None:
110
+ pitch_scaler = StandardScaler()
111
+ for utt_info in meta_data:
112
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
113
+ pitch_path = os.path.join(
114
+ processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
115
+ )
116
+ pitch = np.load(pitch_path)
117
+ assert len(pitch) > 0
118
+ uv = pitch != 0
119
+ utt2uv[utt] = uv
120
+ if use_log_scale:
121
+ nonzero_idxes = np.where(pitch != 0)[0]
122
+ pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
123
+ utt2pitch[utt] = pitch
124
+ pitch_scaler.partial_fit(pitch.reshape(-1, 1))
125
+
126
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
127
+ if return_norm:
128
+ for utt_info in meta_data:
129
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
130
+ pitch = utt2pitch[utt]
131
+ normalized_pitch = (pitch - mean) / std
132
+ utt2pitch[utt] = normalized_pitch
133
+ pitch_statistic = {"mean": mean, "std": std}
134
+ else:
135
+ spk2utt = {}
136
+ pitch_statistic = []
137
+ for utt_info in meta_data:
138
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
139
+ if utt2spk[utt] not in spk2utt:
140
+ spk2utt[utt2spk[utt]] = []
141
+ spk2utt[utt2spk[utt]].append(utt)
142
+
143
+ for spk in spk2utt:
144
+ pitch_scaler = StandardScaler()
145
+ for utt in spk2utt[spk]:
146
+ dataset = utt.split("_")[0]
147
+ uid = "_".join(utt.split("_")[1:])
148
+ pitch_path = os.path.join(
149
+ processed_dir, dataset, pitch_dir, f"{uid}.npy"
150
+ )
151
+ pitch = np.load(pitch_path)
152
+ assert len(pitch) > 0
153
+ uv = pitch != 0
154
+ utt2uv[utt] = uv
155
+ if use_log_scale:
156
+ nonzero_idxes = np.where(pitch != 0)[0]
157
+ pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
158
+ utt2pitch[utt] = pitch
159
+ pitch_scaler.partial_fit(pitch.reshape(-1, 1))
160
+
161
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
162
+ if return_norm:
163
+ for utt in spk2utt[spk]:
164
+ pitch = utt2pitch[utt]
165
+ normalized_pitch = (pitch - mean) / std
166
+ utt2pitch[utt] = normalized_pitch
167
+ pitch_statistic.append({"spk": spk, "mean": mean, "std": std})
168
+
169
+ return utt2pitch, utt2uv, pitch_statistic
170
+
171
+
172
+ # discard
173
+ def load_phone_pitch(
174
+ meta_data,
175
+ processed_dir,
176
+ pitch_dir,
177
+ utt2dur,
178
+ use_log_scale=False,
179
+ return_norm=False,
180
+ interoperate=True,
181
+ utt2spk=None,
182
+ ):
183
+ print("Load Phone Pitch")
184
+ utt2pitch = {}
185
+ utt2uv = {}
186
+ if utt2spk is None:
187
+ pitch_scaler = StandardScaler()
188
+ for utt_info in tqdm(meta_data):
189
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
190
+ pitch_path = os.path.join(
191
+ processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
192
+ )
193
+ frame_pitch = np.load(pitch_path)
194
+ assert len(frame_pitch) > 0
195
+ uv = frame_pitch != 0
196
+ utt2uv[utt] = uv
197
+ phone_pitch = phone_average_pitch(frame_pitch, utt2dur[utt], interoperate)
198
+ if use_log_scale:
199
+ nonzero_idxes = np.where(phone_pitch != 0)[0]
200
+ phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
201
+ utt2pitch[utt] = phone_pitch
202
+ pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))
203
+
204
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
205
+ max_value = np.finfo(np.float64).min
206
+ min_value = np.finfo(np.float64).max
207
+ if return_norm:
208
+ for utt_info in meta_data:
209
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
210
+ pitch = utt2pitch[utt]
211
+ normalized_pitch = (pitch - mean) / std
212
+ max_value = max(max_value, max(normalized_pitch))
213
+ min_value = min(min_value, min(normalized_pitch))
214
+ utt2pitch[utt] = normalized_pitch
215
+ phone_normalized_pitch_path = os.path.join(
216
+ processed_dir,
217
+ utt_info["Dataset"],
218
+ "phone_level_" + pitch_dir,
219
+ f'{utt_info["Uid"]}.npy',
220
+ )
221
+ pitch_statistic = {
222
+ "mean": mean,
223
+ "std": std,
224
+ "min_value": min_value,
225
+ "max_value": max_value,
226
+ }
227
+ else:
228
+ spk2utt = {}
229
+ pitch_statistic = []
230
+ for utt_info in tqdm(meta_data):
231
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
232
+ if utt2spk[utt] not in spk2utt:
233
+ spk2utt[utt2spk[utt]] = []
234
+ spk2utt[utt2spk[utt]].append(utt)
235
+
236
+ for spk in spk2utt:
237
+ pitch_scaler = StandardScaler()
238
+ for utt in spk2utt[spk]:
239
+ dataset = utt.split("_")[0]
240
+ uid = "_".join(utt.split("_")[1:])
241
+ pitch_path = os.path.join(
242
+ processed_dir, dataset, pitch_dir, f"{uid}.npy"
243
+ )
244
+ frame_pitch = np.load(pitch_path)
245
+ assert len(frame_pitch) > 0
246
+ uv = frame_pitch != 0
247
+ utt2uv[utt] = uv
248
+ phone_pitch = phone_average_pitch(
249
+ frame_pitch, utt2dur[utt], interoperate
250
+ )
251
+ if use_log_scale:
252
+ nonzero_idxes = np.where(phone_pitch != 0)[0]
253
+ phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
254
+ utt2pitch[utt] = phone_pitch
255
+ pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))
256
+
257
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
258
+ max_value = np.finfo(np.float64).min
259
+ min_value = np.finfo(np.float64).max
260
+
261
+ if return_norm:
262
+ for utt in spk2utt[spk]:
263
+ pitch = utt2pitch[utt]
264
+ normalized_pitch = (pitch - mean) / std
265
+ max_value = max(max_value, max(normalized_pitch))
266
+ min_value = min(min_value, min(normalized_pitch))
267
+ utt2pitch[utt] = normalized_pitch
268
+ pitch_statistic.append(
269
+ {
270
+ "spk": spk,
271
+ "mean": mean,
272
+ "std": std,
273
+ "min_value": min_value,
274
+ "max_value": max_value,
275
+ }
276
+ )
277
+
278
+ return utt2pitch, utt2uv, pitch_statistic
279
+
280
+
281
+ def phone_average_pitch(pitch, dur, interoperate=False):
282
+ pos = 0
283
+
284
+ if interoperate:
285
+ nonzero_ids = np.where(pitch != 0)[0]
286
+ interp_fn = interp1d(
287
+ nonzero_ids,
288
+ pitch[nonzero_ids],
289
+ fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
290
+ bounds_error=False,
291
+ )
292
+ pitch = interp_fn(np.arange(0, len(pitch)))
293
+ phone_pitch = np.zeros(len(dur))
294
+
295
+ for i, d in enumerate(dur):
296
+ d = int(d)
297
+ if d > 0 and pos < len(pitch):
298
+ phone_pitch[i] = np.mean(pitch[pos : pos + d])
299
+ else:
300
+ phone_pitch[i] = 0
301
+ pos += d
302
+ return phone_pitch
303
+
304
+
305
+ def load_energy(
306
+ meta_data,
307
+ processed_dir,
308
+ energy_dir,
309
+ use_log_scale=False,
310
+ return_norm=False,
311
+ utt2spk=None,
312
+ ):
313
+ utt2energy = {}
314
+ if utt2spk is None:
315
+ for utt_info in meta_data:
316
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
317
+ energy_path = os.path.join(
318
+ processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
319
+ )
320
+ if not os.path.exists(energy_path):
321
+ continue
322
+ energy = np.load(energy_path)
323
+ assert len(energy) > 0
324
+
325
+ if use_log_scale:
326
+ nonzero_idxes = np.where(energy != 0)[0]
327
+ energy[nonzero_idxes] = np.log(energy[nonzero_idxes])
328
+ utt2energy[utt] = energy
329
+
330
+ if return_norm:
331
+ with open(
332
+ os.path.join(
333
+ processed_dir, utt_info["Dataset"], energy_dir, "statistics.json"
334
+ )
335
+ ) as f:
336
+ stats = json.load(f)
337
+ mean, std = (
338
+ stats[utt_info["Dataset"] + "_" + utt_info["Singer"]][
339
+ "voiced_positions"
340
+ ]["mean"],
341
+ stats["LJSpeech_LJSpeech"]["voiced_positions"]["std"],
342
+ )
343
+ for utt in utt2energy.keys():
344
+ energy = utt2energy[utt]
345
+ normalized_energy = (energy - mean) / std
346
+ utt2energy[utt] = normalized_energy
347
+
348
+ energy_statistic = {"mean": mean, "std": std}
349
+ else:
350
+ spk2utt = {}
351
+ energy_statistic = []
352
+ for utt_info in meta_data:
353
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
354
+ if utt2spk[utt] not in spk2utt:
355
+ spk2utt[utt2spk[utt]] = []
356
+ spk2utt[utt2spk[utt]].append(utt)
357
+
358
+ for spk in spk2utt:
359
+ energy_scaler = StandardScaler()
360
+ for utt in spk2utt[spk]:
361
+ dataset = utt.split("_")[0]
362
+ uid = "_".join(utt.split("_")[1:])
363
+ energy_path = os.path.join(
364
+ processed_dir, dataset, energy_dir, f"{uid}.npy"
365
+ )
366
+ if not os.path.exists(energy_path):
367
+ continue
368
+ frame_energy = np.load(energy_path)
369
+ assert len(frame_energy) > 0
370
+
371
+ if use_log_scale:
372
+ nonzero_idxes = np.where(frame_energy != 0)[0]
373
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
374
+ utt2energy[utt] = frame_energy
375
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
376
+
377
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
378
+ if return_norm:
379
+ for utt in spk2utt[spk]:
380
+ energy = utt2energy[utt]
381
+ normalized_energy = (energy - mean) / std
382
+ utt2energy[utt] = normalized_energy
383
+ energy_statistic.append({"spk": spk, "mean": mean, "std": std})
384
+
385
+ return utt2energy, energy_statistic
386
+
387
+
388
+ def load_frame_energy(
389
+ meta_data,
390
+ processed_dir,
391
+ energy_dir,
392
+ use_log_scale=False,
393
+ return_norm=False,
394
+ interoperate=False,
395
+ utt2spk=None,
396
+ ):
397
+ utt2energy = {}
398
+ if utt2spk is None:
399
+ energy_scaler = StandardScaler()
400
+ for utt_info in meta_data:
401
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
402
+ energy_path = os.path.join(
403
+ processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
404
+ )
405
+ frame_energy = np.load(energy_path)
406
+ assert len(frame_energy) > 0
407
+
408
+ if use_log_scale:
409
+ nonzero_idxes = np.where(frame_energy != 0)[0]
410
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
411
+ utt2energy[utt] = frame_energy
412
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
413
+
414
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
415
+ if return_norm:
416
+ for utt_info in meta_data:
417
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
418
+ energy = utt2energy[utt]
419
+ normalized_energy = (energy - mean) / std
420
+ utt2energy[utt] = normalized_energy
421
+ energy_statistic = {"mean": mean, "std": std}
422
+
423
+ else:
424
+ spk2utt = {}
425
+ energy_statistic = []
426
+ for utt_info in meta_data:
427
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
428
+ if utt2spk[utt] not in spk2utt:
429
+ spk2utt[utt2spk[utt]] = []
430
+ spk2utt[utt2spk[utt]].append(utt)
431
+
432
+ for spk in spk2utt:
433
+ energy_scaler = StandardScaler()
434
+ for utt in spk2utt[spk]:
435
+ dataset = utt.split("_")[0]
436
+ uid = "_".join(utt.split("_")[1:])
437
+ energy_path = os.path.join(
438
+ processed_dir, dataset, energy_dir, f"{uid}.npy"
439
+ )
440
+ frame_energy = np.load(energy_path)
441
+ assert len(frame_energy) > 0
442
+
443
+ if use_log_scale:
444
+ nonzero_idxes = np.where(frame_energy != 0)[0]
445
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
446
+ utt2energy[utt] = frame_energy
447
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
448
+
449
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
450
+ if return_norm:
451
+ for utt in spk2utt[spk]:
452
+ energy = utt2energy[utt]
453
+ normalized_energy = (energy - mean) / std
454
+ utt2energy[utt] = normalized_energy
455
+ energy_statistic.append({"spk": spk, "mean": mean, "std": std})
456
+
457
+ return utt2energy, energy_statistic
458
+
459
+
460
+ def align_length(feature, target_len, pad_value=0.0):
461
+ feature_len = feature.shape[-1]
462
+ dim = len(feature.shape)
463
+ # align 2-D data
464
+ if dim == 2:
465
+ if target_len > feature_len:
466
+ feature = np.pad(
467
+ feature,
468
+ ((0, 0), (0, target_len - feature_len)),
469
+ constant_values=pad_value,
470
+ )
471
+ else:
472
+ feature = feature[:, :target_len]
473
+ # align 1-D data
474
+ elif dim == 1:
475
+ if target_len > feature_len:
476
+ feature = np.pad(
477
+ feature, (0, target_len - feature_len), constant_values=pad_value
478
+ )
479
+ else:
480
+ feature = feature[:target_len]
481
+ else:
482
+ raise NotImplementedError
483
+ return feature
484
+
485
+
486
+ def align_whisper_feauture_length(
487
+ feature, target_len, fast_mapping=True, source_hop=320, target_hop=256
488
+ ):
489
+ factor = np.gcd(source_hop, target_hop)
490
+ source_hop //= factor
491
+ target_hop //= factor
492
+ # print(
493
+ # "Mapping source's {} frames => target's {} frames".format(
494
+ # target_hop, source_hop
495
+ # )
496
+ # )
497
+
498
+ max_source_len = 1500
499
+ target_len = min(target_len, max_source_len * source_hop // target_hop)
500
+
501
+ width = feature.shape[-1]
502
+
503
+ if fast_mapping:
504
+ source_len = target_len * target_hop // source_hop + 1
505
+ feature = feature[:source_len]
506
+
507
+ else:
508
+ source_len = max_source_len
509
+
510
+ # const ~= target_len * target_hop
511
+ const = source_len * source_hop // target_hop * target_hop
512
+
513
+ # (source_len * source_hop, dim)
514
+ up_sampling_feats = np.repeat(feature, source_hop, axis=0)
515
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
516
+ down_sampling_feats = np.average(
517
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
518
+ )
519
+ assert len(down_sampling_feats) >= target_len
520
+
521
+ # (target_len, dim)
522
+ feat = down_sampling_feats[:target_len]
523
+
524
+ return feat
525
+
526
+
527
+ def align_content_feature_length(feature, target_len, source_hop=320, target_hop=256):
528
+ factor = np.gcd(source_hop, target_hop)
529
+ source_hop //= factor
530
+ target_hop //= factor
531
+ # print(
532
+ # "Mapping source's {} frames => target's {} frames".format(
533
+ # target_hop, source_hop
534
+ # )
535
+ # )
536
+
537
+ # (source_len, 256)
538
+ source_len, width = feature.shape
539
+
540
+ # const ~= target_len * target_hop
541
+ const = source_len * source_hop // target_hop * target_hop
542
+
543
+ # (source_len * source_hop, dim)
544
+ up_sampling_feats = np.repeat(feature, source_hop, axis=0)
545
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
546
+ down_sampling_feats = np.average(
547
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
548
+ )
549
+
550
+ err = abs(target_len - len(down_sampling_feats))
551
+ if err > 4: ## why 4 not 3?
552
+ print("target_len:", target_len)
553
+ print("raw feature:", feature.shape)
554
+ print("up_sampling:", up_sampling_feats.shape)
555
+ print("down_sampling_feats:", down_sampling_feats.shape)
556
+ exit()
557
+ if len(down_sampling_feats) < target_len:
558
+ # (1, dim) -> (err, dim)
559
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
560
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
561
+
562
+ # (target_len, dim)
563
+ feat = down_sampling_feats[:target_len]
564
+
565
+ return feat
566
+
567
+
568
+ def remove_outlier(values):
569
+ values = np.array(values)
570
+ p25 = np.percentile(values, 25)
571
+ p75 = np.percentile(values, 75)
572
+ lower = p25 - 1.5 * (p75 - p25)
573
+ upper = p75 + 1.5 * (p75 - p25)
574
+ normal_indices = np.logical_and(values > lower, values < upper)
575
+ return values[normal_indices]
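The align_whisper_feauture_length and align_content_feature_length helpers above share one resampling trick: after reducing both hops by their GCD, repeat each source frame source_hop times and average in windows of target_hop. A minimal numpy-only sketch of that idea on toy shapes (the sizes below are illustrative assumptions, not the repository's defaults):

import numpy as np

# Toy content feature: 10 frames extracted with a 320-sample hop,
# to be mapped onto a 256-sample mel hop.
feature = np.random.rand(10, 4)
source_hop, target_hop = 320, 256
factor = np.gcd(source_hop, target_hop)  # 64
source_hop //= factor                    # 5
target_hop //= factor                    # 4

# Repeat each source frame 5 times, then average every 4 "samples",
# so the output frame rate matches the target hop.
width = feature.shape[-1]
const = feature.shape[0] * source_hop // target_hop * target_hop
up = np.repeat(feature, source_hop, axis=0)
down = np.average(up[:const].reshape(-1, target_hop, width), axis=1)
print(feature.shape, "->", down.shape)   # (10, 4) -> (12, 4)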
utils/distribution.py ADDED
@@ -0,0 +1,270 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from torch.distributions import Normal
11
+
12
+
13
+ def log_sum_exp(x):
14
+ """numerically stable log_sum_exp implementation that prevents overflow"""
15
+ # TF ordering
16
+ axis = len(x.size()) - 1
17
+ m, _ = torch.max(x, dim=axis)
18
+ m2, _ = torch.max(x, dim=axis, keepdim=True)
19
+ return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))
20
+
21
+
22
+ def discretized_mix_logistic_loss(
23
+ y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True
24
+ ):
25
+ """Discretized mixture of logistic distributions loss
26
+
27
+ Note that it is assumed that input is scaled to [-1, 1].
28
+
29
+ Args:
30
+ y_hat (Tensor): Predicted output (B x C x T)
31
+ y (Tensor): Target (B x T x 1).
32
+ num_classes (int): Number of classes
33
+ log_scale_min (float): Log scale minimum value
34
+ reduce (bool): If True, the losses are averaged or summed for each
35
+ minibatch.
36
+
37
+ Returns
38
+ Tensor: loss
39
+ """
40
+ assert y_hat.dim() == 3
41
+ assert y_hat.size(1) % 3 == 0
42
+ nr_mix = y_hat.size(1) // 3
43
+
44
+ # (B x T x C)
45
+ y_hat = y_hat.transpose(1, 2)
46
+
47
+ # unpack parameters. (B, T, num_mixtures) x 3
48
+ logit_probs = y_hat[:, :, :nr_mix]
49
+ means = y_hat[:, :, nr_mix : 2 * nr_mix]
50
+ log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min)
51
+
52
+ # B x T x 1 -> B x T x num_mixtures
53
+ y = y.expand_as(means)
54
+
55
+ centered_y = y - means
56
+ inv_stdv = torch.exp(-log_scales)
57
+ plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1))
58
+ cdf_plus = torch.sigmoid(plus_in)
59
+ min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1))
60
+ cdf_min = torch.sigmoid(min_in)
61
+
62
+ # log probability for edge case of 0 (before scaling)
63
+ # equivalent: torch.log(torch.sigmoid(plus_in))
64
+ log_cdf_plus = plus_in - F.softplus(plus_in)
65
+
66
+ # log probability for edge case of 255 (before scaling)
67
+ # equivalent: (1 - torch.sigmoid(min_in)).log()
68
+ log_one_minus_cdf_min = -F.softplus(min_in)
69
+
70
+ # probability for all other cases
71
+ cdf_delta = cdf_plus - cdf_min
72
+
73
+ mid_in = inv_stdv * centered_y
74
+ # log probability in the center of the bin, to be used in extreme cases
75
+ # (not actually used in our code)
76
+ log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in)
77
+
78
+ # tf equivalent
79
+ """
80
+ log_probs = tf.where(x < -0.999, log_cdf_plus,
81
+ tf.where(x > 0.999, log_one_minus_cdf_min,
82
+ tf.where(cdf_delta > 1e-5,
83
+ tf.log(tf.maximum(cdf_delta, 1e-12)),
84
+ log_pdf_mid - np.log(127.5))))
85
+ """
86
+ # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
87
+ # for num_classes=65536 case? 1e-7? not sure..
88
+ inner_inner_cond = (cdf_delta > 1e-5).float()
89
+
90
+ inner_inner_out = inner_inner_cond * torch.log(
91
+ torch.clamp(cdf_delta, min=1e-12)
92
+ ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
93
+ inner_cond = (y > 0.999).float()
94
+ inner_out = (
95
+ inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out
96
+ )
97
+ cond = (y < -0.999).float()
98
+ log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out
99
+
100
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
101
+
102
+ if reduce:
103
+ return -torch.sum(log_sum_exp(log_probs))
104
+ else:
105
+ return -log_sum_exp(log_probs).unsqueeze(-1)
106
+
107
+
108
+ def to_one_hot(tensor, n, fill_with=1.0):
109
+ # we perform one-hot encoding with respect to the last axis
110
+ one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
111
+ if tensor.is_cuda:
112
+ one_hot = one_hot.cuda()
113
+ one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
114
+ return one_hot
115
+
116
+
117
+ def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0, clamp_log_scale=False):
118
+ """
119
+ Sample from discretized mixture of logistic distributions
120
+
121
+ Args:
122
+ y (Tensor): B x C x T
123
+ log_scale_min (float): Log scale minimum value
124
+
125
+ Returns:
126
+ Tensor: sample in range of [-1, 1].
127
+ """
128
+ assert y.size(1) % 3 == 0
129
+ nr_mix = y.size(1) // 3
130
+
131
+ # B x T x C
132
+ y = y.transpose(1, 2)
133
+ logit_probs = y[:, :, :nr_mix]
134
+
135
+ # sample mixture indicator from softmax
136
+ temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
137
+ temp = logit_probs.data - torch.log(-torch.log(temp))
138
+ _, argmax = temp.max(dim=-1)
139
+
140
+ # (B, T) -> (B, T, nr_mix)
141
+ one_hot = to_one_hot(argmax, nr_mix)
142
+ # select logistic parameters
143
+ means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
144
+ log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
145
+ if clamp_log_scale:
146
+ log_scales = torch.clamp(log_scales, min=log_scale_min)
147
+ # sample from logistic & clip to interval
148
+ # we don't actually round to the nearest 8bit value when sampling
149
+ u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
150
+ x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u))
151
+
152
+ x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0)
153
+
154
+ return x
155
+
156
+
157
+ # We could easily define a discretized version of the Gaussian loss; however,
158
+ # we use the continuous version, the same as https://clarinet-demo.github.io/
159
+ def mix_gaussian_loss(y_hat, y, log_scale_min=-7.0, reduce=True):
160
+ """Mixture of continuous gaussian distributions loss
161
+
162
+ Note that it is assumed that input is scaled to [-1, 1].
163
+
164
+ Args:
165
+ y_hat (Tensor): Predicted output (B x C x T)
166
+ y (Tensor): Target (B x T x 1).
167
+ log_scale_min (float): Log scale minimum value
168
+ reduce (bool): If True, the losses are averaged or summed for each
169
+ minibatch.
170
+ Returns
171
+ Tensor: loss
172
+ """
173
+ assert y_hat.dim() == 3
174
+ C = y_hat.size(1)
175
+ if C == 2:
176
+ nr_mix = 1
177
+ else:
178
+ assert y_hat.size(1) % 3 == 0
179
+ nr_mix = y_hat.size(1) // 3
180
+
181
+ # (B x T x C)
182
+ y_hat = y_hat.transpose(1, 2)
183
+
184
+ # unpack parameters.
185
+ if C == 2:
186
+ # special case for C == 2, just for compatibility
187
+ logit_probs = None
188
+ means = y_hat[:, :, 0:1]
189
+ log_scales = torch.clamp(y_hat[:, :, 1:2], min=log_scale_min)
190
+ else:
191
+ # (B, T, num_mixtures) x 3
192
+ logit_probs = y_hat[:, :, :nr_mix]
193
+ means = y_hat[:, :, nr_mix : 2 * nr_mix]
194
+ log_scales = torch.clamp(
195
+ y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min
196
+ )
197
+
198
+ # B x T x 1 -> B x T x num_mixtures
199
+ y = y.expand_as(means)
200
+
201
+ centered_y = y - means
202
+ dist = Normal(loc=0.0, scale=torch.exp(log_scales))
203
+ # do we need to add a trick to avoid log(0)?
204
+ log_probs = dist.log_prob(centered_y)
205
+
206
+ if nr_mix > 1:
207
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
208
+
209
+ if reduce:
210
+ if nr_mix == 1:
211
+ return -torch.sum(log_probs)
212
+ else:
213
+ return -torch.sum(log_sum_exp(log_probs))
214
+ else:
215
+ if nr_mix == 1:
216
+ return -log_probs
217
+ else:
218
+ return -log_sum_exp(log_probs).unsqueeze(-1)
219
+
220
+
221
+ def sample_from_mix_gaussian(y, log_scale_min=-7.0):
222
+ """
223
+ Sample from (discretized) mixture of gaussian distributions
224
+ Args:
225
+ y (Tensor): B x C x T
226
+ log_scale_min (float): Log scale minimum value
227
+ Returns:
228
+ Tensor: sample in range of [-1, 1].
229
+ """
230
+ C = y.size(1)
231
+ if C == 2:
232
+ nr_mix = 1
233
+ else:
234
+ assert y.size(1) % 3 == 0
235
+ nr_mix = y.size(1) // 3
236
+
237
+ # B x T x C
238
+ y = y.transpose(1, 2)
239
+
240
+ if C == 2:
241
+ logit_probs = None
242
+ else:
243
+ logit_probs = y[:, :, :nr_mix]
244
+
245
+ if nr_mix > 1:
246
+ # sample mixture indicator from softmax
247
+ temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
248
+ temp = logit_probs.data - torch.log(-torch.log(temp))
249
+ _, argmax = temp.max(dim=-1)
250
+
251
+ # (B, T) -> (B, T, nr_mix)
252
+ one_hot = to_one_hot(argmax, nr_mix)
253
+
254
+ # Select means and log scales
255
+ means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
256
+ log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
257
+ else:
258
+ if C == 2:
259
+ means, log_scales = y[:, :, 0], y[:, :, 1]
260
+ elif C == 3:
261
+ means, log_scales = y[:, :, 1], y[:, :, 2]
262
+ else:
263
+ assert False, "shouldn't happen"
264
+
265
+ scales = torch.exp(log_scales)
266
+ dist = Normal(loc=means, scale=scales)
267
+ x = dist.sample()
268
+
269
+ x = torch.clamp(x, min=-1.0, max=1.0)
270
+ return x
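A quick smoke test for the mixture-of-logistics loss and sampler defined above; it assumes the repository root is on PYTHONPATH so this file is importable as utils.distribution, and uses random tensors purely for shape checking:

import torch
from utils.distribution import (
    discretized_mix_logistic_loss,
    sample_from_discretized_mix_logistic,
)

B, T, nr_mix = 2, 100, 10
y_hat = torch.randn(B, 3 * nr_mix, T)  # [logit_probs | means | log_scales]
y = torch.rand(B, T, 1) * 2 - 1        # targets scaled to [-1, 1]

loss = discretized_mix_logistic_loss(y_hat, y, num_classes=256)
sample = sample_from_discretized_mix_logistic(y_hat)
print(loss.item(), sample.shape)       # scalar loss, torch.Size([2, 100])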
utils/dsp.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ # ZERO = 1e-12
10
+
11
+
12
+ def gaussian_normalize_mel_channel(mel, mu, sigma):
13
+ """
14
+ Shift to the standard normal distribution
15
+
16
+ Args:
17
+ mel: (n_mels, frame_len)
18
+ mu: (n_mels,), mean value
19
+ sigma: (n_mels,), sd value
20
+ Return:
21
+ Tensor like mel
22
+ """
23
+ mu = np.expand_dims(mu, -1)
24
+ sigma = np.expand_dims(sigma, -1)
25
+ return (mel - mu) / sigma
26
+
27
+
28
+ def de_gaussian_normalize_mel_channel(mel, mu, sigma):
29
+ """
30
+
31
+ Args:
32
+ mel: (n_mels, frame_len)
33
+ mu: (n_mels,), mean value
34
+ sigma: (n_mels,), sd value
35
+ Return:
36
+ Tensor like mel
37
+ """
38
+ mu = np.expand_dims(mu, -1)
39
+ sigma = np.expand_dims(sigma, -1)
40
+ return sigma * mel + mu
41
+
42
+
43
+ def decompress(audio_compressed, bits):
44
+ mu = 2**bits - 1
45
+ audio = np.sign(audio_compressed) / mu * ((1 + mu) ** np.abs(audio_compressed) - 1)
46
+ return audio
47
+
48
+
49
+ def compress(audio, bits):
50
+ mu = 2**bits - 1
51
+ audio_compressed = np.sign(audio) * np.log(1 + mu * np.abs(audio)) / np.log(mu + 1)
52
+ return audio_compressed
53
+
54
+
55
+ def label_to_audio(quant, bits):
56
+ classes = 2**bits
57
+ audio = 2 * quant / (classes - 1.0) - 1.0
58
+ return audio
59
+
60
+
61
+ def audio_to_label(audio, bits):
62
+ """Normalized audio data tensor to digit array
63
+
64
+ Args:
65
+ audio (tensor): audio data
66
+ bits (int): data bits
67
+
68
+ Returns:
69
+ array<int>: digit array of audio data
70
+ """
71
+ classes = 2**bits
72
+ # initialize an increasing array with values from -1 to 1
73
+ bins = np.linspace(-1, 1, classes)
74
+ # change value in audio tensor to digits
75
+ quant = np.digitize(audio, bins) - 1
76
+ return quant
77
+
78
+
79
+ def label_to_onehot(x, bits):
80
+ """Converts a class vector (integers) to binary class matrix.
81
+ Args:
82
+ x: class vector to be converted into a matrix
83
+ (integers from 0 to num_classes).
84
+ bits: data bits; the number of classes is 2**bits.
85
+ Returns:
86
+ A binary matrix representation of the input. The classes axis
87
+ is placed last.
88
+ """
89
+ classes = 2**bits
90
+
91
+ result = torch.zeros((x.shape[0], classes), dtype=torch.float32)
92
+ for i in range(x.shape[0]):
93
+ result[i, x[i]] = 1
94
+
95
+ output_shape = x.shape + (classes,)
96
+ output = torch.reshape(result, output_shape)
97
+ return output
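A small round-trip sketch for the mu-law helpers above (assuming the file is importable as utils.dsp): it compands a test tone, quantizes it to 8-bit labels, and reconstructs it:

import numpy as np
from utils.dsp import audio_to_label, compress, decompress, label_to_audio

bits = 8
audio = np.sin(np.linspace(0, 4 * np.pi, 16000)).astype(np.float32)

compressed = compress(audio, bits)         # mu-law companding, still in [-1, 1]
labels = audio_to_label(compressed, bits)  # integer labels in [0, 2**bits - 1]
restored = decompress(label_to_audio(labels, bits), bits)

print(labels.min(), labels.max())             # e.g. 0 255
print(float(np.abs(audio - restored).max()))  # small quantization error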
utils/duration.py ADDED
@@ -0,0 +1,86 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import os
8
+ import tgt
9
+
10
+
11
+ def get_alignment(tier, cfg):
12
+ sample_rate = cfg["sample_rate"]
13
+ hop_size = cfg["hop_size"]
14
+
15
+ sil_phones = ["sil", "sp", "spn"]
16
+
17
+ phones = []
18
+ durations = []
19
+ start_time = 0
20
+ end_time = 0
21
+ end_idx = 0
22
+
23
+ for t in tier._objects:
24
+ s, e, p = t.start_time, t.end_time, t.text
25
+
26
+ # Trim leading silences
27
+ if phones == []:
28
+ if p in sil_phones:
29
+ continue
30
+ else:
31
+ start_time = s
32
+
33
+ if p not in sil_phones:
34
+ # For ordinary phones
35
+ phones.append(p)
36
+ end_time = e
37
+ end_idx = len(phones)
38
+ else:
39
+ # For silent phones
40
+ phones.append(p)
41
+
42
+ durations.append(
43
+ int(
44
+ np.round(e * sample_rate / hop_size)
45
+ - np.round(s * sample_rate / hop_size)
46
+ )
47
+ )
48
+
49
+ # Trim trailing silences
50
+ phones = phones[:end_idx]
51
+ durations = durations[:end_idx]
52
+
53
+ return phones, durations, start_time, end_time
54
+
55
+
56
+ def get_duration(utt, wav, cfg):
57
+ speaker = utt["Singer"]
58
+ basename = utt["Uid"]
59
+ dataset = utt["Dataset"]
60
+ sample_rate = cfg["sample_rate"]
61
+
62
+ # print(cfg.processed_dir, dataset, speaker, basename)
63
+ wav_path = os.path.join(
64
+ cfg.processed_dir, dataset, "raw_data", speaker, "{}.wav".format(basename)
65
+ )
66
+ text_path = os.path.join(
67
+ cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
68
+ )
69
+ tg_path = os.path.join(
70
+ cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
71
+ )
72
+
73
+ # Read raw text
74
+ with open(text_path, "r") as f:
75
+ raw_text = f.readline().strip("\n")
76
+
77
+ # Get alignments
78
+ textgrid = tgt.io.read_textgrid(tg_path)
79
+ phone, duration, start, end = get_alignment(
80
+ textgrid.get_tier_by_name("phones"), cfg
81
+ )
82
+ text = "{" + " ".join(phone) + "}"
83
+ if start >= end:
84
+ return None
85
+
86
+ return duration, text, int(sample_rate * start), int(sample_rate * end)
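The frame-count bookkeeping in get_alignment is the part that is easy to get wrong: each phone spans round(end*sr/hop) - round(start*sr/hop) frames, so the per-phone durations always sum to the total frame count. A toy check with made-up intervals and an assumed 24 kHz / 300-sample-hop setup:

import numpy as np

sample_rate, hop_size = 24000, 300
intervals = [(0.00, 0.12, "HH"), (0.12, 0.31, "AH"), (0.31, 0.50, "L")]

durations = [
    int(np.round(e * sample_rate / hop_size) - np.round(s * sample_rate / hop_size))
    for s, e, _ in intervals
]
print(durations, sum(durations))  # [10, 15, 15] 40, and round(0.50 * 24000 / 300) == 40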
utils/f0.py ADDED
@@ -0,0 +1,320 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import torch
9
+ import parselmouth
10
+ import torchcrepe
11
+ import pyworld as pw
12
+
13
+
14
+ def get_bin_index(f0, m, M, n_bins, use_log_scale):
15
+ """
16
+ WARNING: deprecated, to be removed.
17
+
18
+ Args:
19
+ raw_f0: tensor whose shape is (N, frame_len)
20
+ Returns:
21
+ index: tensor whose shape is the same as f0
22
+ """
23
+ raw_f0 = f0.clone()
24
+ raw_m, raw_M = m, M
25
+
26
+ if use_log_scale:
27
+ f0[torch.where(f0 == 0)] = 1
28
+ f0 = torch.log(f0)
29
+ m, M = float(np.log(m)), float(np.log(M))
30
+
31
+ # Set normal index in [1, n_bins - 1]
32
+ width = (M + 1e-7 - m) / (n_bins - 1)
33
+ index = (f0 - m) // width + 1
34
+ # Set unvoiced frames to 0. Therefore, the vocabulary is [0, n_bins - 1], whose size is n_bins
35
+ index[torch.where(f0 == 0)] = 0
36
+
37
+ # TODO: Boundary check (special: to judge whether 0 for unvoiced)
38
+ if torch.any(raw_f0 > raw_M):
39
+ print("F0 Warning: too high f0: {}".format(raw_f0[torch.where(raw_f0 > raw_M)]))
40
+ index[torch.where(raw_f0 > raw_M)] = n_bins - 1
41
+ if torch.any(raw_f0 < raw_m):
42
+ print("F0 Warning: too low f0: {}".format(raw_f0[torch.where(f0 < m)]))
43
+ index[torch.where(f0 < m)] = 0
44
+
45
+ return torch.as_tensor(index, dtype=torch.long, device=f0.device)
46
+
47
+
48
+ def f0_to_coarse(f0, pitch_bin, pitch_min, pitch_max):
49
+ ## TODO: Figure out the detail of this function
50
+
51
+ f0_mel_min = 1127 * np.log(1 + pitch_min / 700)
52
+ f0_mel_max = 1127 * np.log(1 + pitch_max / 700)
53
+
54
+ is_torch = isinstance(f0, torch.Tensor)
55
+ f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
56
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (pitch_bin - 2) / (
57
+ f0_mel_max - f0_mel_min
58
+ ) + 1
59
+
60
+ f0_mel[f0_mel <= 1] = 1
61
+ f0_mel[f0_mel > pitch_bin - 1] = pitch_bin - 1
62
+ f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int32)
63
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
64
+ f0_coarse.max(),
65
+ f0_coarse.min(),
66
+ )
67
+ return f0_coarse
68
+
69
+
70
+ def interpolate(f0):
71
+ """Interpolate the unvoiced part. Thus the f0 can be passed to a subtractive synthesizer.
72
+ Args:
73
+ f0: A numpy array of shape (seq_len,)
74
+ Returns:
75
+ f0: Interpolated f0 of shape (seq_len,)
76
+ uv: Unvoiced part of shape (seq_len,)
77
+ """
78
+ uv = f0 == 0
79
+ if len(f0[~uv]) > 0:
80
+ # interpolate the unvoiced f0
81
+ f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
82
+ uv = uv.astype("float")
83
+ uv = np.min(np.array([uv[:-2], uv[1:-1], uv[2:]]), axis=0)
84
+ uv = np.pad(uv, (1, 1))
85
+ return f0, uv
86
+
87
+
88
+ def get_log_f0(f0):
89
+ f0[np.where(f0 == 0)] = 1
90
+ log_f0 = np.log(f0)
91
+ return log_f0
92
+
93
+
94
+ # ========== Methods ==========
95
+
96
+
97
+ def get_f0_features_using_pyin(audio, cfg):
98
+ """Using pyin to extract the f0 feature.
99
+ Args:
100
+ audio
101
+ fs
102
+ win_length
103
+ hop_length
104
+ f0_min
105
+ f0_max
106
+ Returns:
107
+ f0: numpy array of shape (frame_len,)
108
+ """
109
+ f0, voiced_flag, voiced_probs = librosa.pyin(
110
+ y=audio,
111
+ fmin=cfg.f0_min,
112
+ fmax=cfg.f0_max,
113
+ sr=cfg.sample_rate,
114
+ win_length=cfg.win_size,
115
+ hop_length=cfg.hop_size,
116
+ )
117
+ # Set nan to 0
118
+ f0[voiced_flag == False] = 0
119
+ return f0
120
+
121
+
122
+ def get_f0_features_using_parselmouth(audio, cfg, speed=1):
123
+ """Using parselmouth to extract the f0 feature.
124
+ Args:
125
+ audio
126
+ mel_len
127
+ hop_length
128
+ fs
129
+ f0_min
130
+ f0_max
131
+ speed(default=1)
132
+ Returns:
133
+ f0: numpy array of shape (frame_len,)
134
+ pitch_coarse: numpy array of shape (frame_len,)
135
+ """
136
+ hop_size = int(np.round(cfg.hop_size * speed))
137
+
138
+ # Calculate the time step for pitch extraction
139
+ time_step = hop_size / cfg.sample_rate * 1000
140
+
141
+ f0 = (
142
+ parselmouth.Sound(audio, cfg.sample_rate)
143
+ .to_pitch_ac(
144
+ time_step=time_step / 1000,
145
+ voicing_threshold=0.6,
146
+ pitch_floor=cfg.f0_min,
147
+ pitch_ceiling=cfg.f0_max,
148
+ )
149
+ .selected_array["frequency"]
150
+ )
151
+
152
+ # Pad the pitch to the mel_len
153
+ # pad_size = (int(len(audio) // hop_size) - len(f0) + 1) // 2
154
+ # f0 = np.pad(f0, [[pad_size, mel_len - len(f0) - pad_size]], mode="constant")
155
+
156
+ # Get the coarse part
157
+ pitch_coarse = f0_to_coarse(f0, cfg.pitch_bin, cfg.f0_min, cfg.f0_max)
158
+ return f0, pitch_coarse
159
+
160
+
161
+ def get_f0_features_using_dio(audio, cfg):
162
+ """Using dio to extract the f0 feature.
163
+ Args:
164
+ audio
165
+ mel_len
166
+ fs
167
+ hop_length
168
+ f0_min
169
+ f0_max
170
+ Returns:
171
+ f0: numpy array of shape (frame_len,)
172
+ """
173
+ # Get the raw f0
174
+ _f0, t = pw.dio(
175
+ audio.astype("double"),
176
+ cfg.sample_rate,
177
+ f0_floor=cfg.f0_min,
178
+ f0_ceil=cfg.f0_max,
179
+ channels_in_octave=2,
180
+ frame_period=(1000 * cfg.hop_size / cfg.sample_rate),
181
+ )
182
+ # Get the f0
183
+ f0 = pw.stonemask(audio.astype("double"), _f0, t, cfg.sample_rate)
184
+ return f0
185
+
186
+
187
+ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max):
188
+ """Using harvest to extract the f0 feature.
189
+ Args:
190
+ audio
191
+ mel_len
192
+ fs
193
+ hop_length
194
+ f0_min
195
+ f0_max
196
+ Returns:
197
+ f0: numpy array of shape (frame_len,)
198
+ """
199
+ f0, _ = pw.harvest(
200
+ audio.astype("double"),
201
+ fs,
202
+ f0_floor=f0_min,
203
+ f0_ceil=f0_max,
204
+ frame_period=(1000 * hop_length / fs),
205
+ )
206
+ f0 = f0.astype("float")[:mel_len]
207
+ return f0
208
+
209
+
210
+ def get_f0_features_using_crepe_legacy(
211
+ audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
212
+ ):
213
+ """Using torchcrepe to extract the f0 feature.
214
+ Args:
215
+ audio
216
+ mel_len
217
+ fs
218
+ hop_length
219
+ hop_length_new
220
+ f0_min
221
+ f0_max
222
+ threshold(default=0.3)
223
+ Returns:
224
+ f0: numpy array of shape (frame_len,)
225
+ """
226
+ # Currently, crepe only supports 16 kHz audio
227
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
228
+ audio_16k = librosa.resample(audio, orig_sr=fs, target_sr=16000)
229
+ audio_16k_torch = torch.FloatTensor(audio_16k).unsqueeze(0).to(device)
230
+
231
+ # Get the raw pitch
232
+ f0, pd = torchcrepe.predict(
233
+ audio_16k_torch,
234
+ 16000,
235
+ hop_length_new,
236
+ f0_min,
237
+ f0_max,
238
+ pad=True,
239
+ model="full",
240
+ batch_size=1024,
241
+ device=device,
242
+ return_periodicity=True,
243
+ )
244
+
245
+ # Filter, de-silence, set up threshold for unvoiced part
246
+ pd = torchcrepe.filter.median(pd, 3)
247
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_16k_torch, 16000, hop_length_new)
248
+ f0 = torchcrepe.threshold.At(threshold)(f0, pd)
249
+ f0 = torchcrepe.filter.mean(f0, 3)
250
+
251
+ # Convert unvoiced part to 0hz
252
+ f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)
253
+
254
+ # Interpolate f0
255
+ nzindex = torch.nonzero(f0[0]).squeeze()
256
+ f0 = torch.index_select(f0[0], dim=0, index=nzindex).cpu().numpy()
257
+ time_org = 0.005 * nzindex.cpu().numpy()
258
+ time_frame = np.arange(mel_len) * hop_length / fs
259
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
260
+ return f0
261
+
262
+ def get_f0_features_using_crepe(audio, cfg):
263
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
264
+ audio_torch = torch.FloatTensor(audio).unsqueeze(0).to(device)
265
+
266
+ crepe_pitch, pd = torchcrepe.predict(audio_torch, cfg.sample_rate, cfg.hop_size, fmin=cfg.f0_min, fmax=cfg.f0_max, return_periodicity=True)
267
+
268
+ threshold = 0.3
269
+
270
+ # Filter, de-silence, set up threshold for unvoiced part
271
+ pd = torchcrepe.filter.median(pd, 3)
272
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_torch, cfg.sample_rate, 256)
273
+ crepe_pitch = torchcrepe.threshold.At(threshold)(crepe_pitch, pd)
274
+ crepe_pitch = torchcrepe.filter.mean(crepe_pitch, 3)
275
+
276
+ # Convert unvoiced part to 0hz
277
+ crepe_pitch = torch.where(torch.isnan(crepe_pitch), torch.full_like(crepe_pitch, 0), crepe_pitch)
278
+
279
+ return crepe_pitch[0].cpu().numpy()
280
+
281
+
282
+ def get_f0(audio, cfg):
283
+ if cfg.pitch_extractor == "dio":
284
+ f0 = get_f0_features_using_dio(audio, cfg)
285
+ elif cfg.pitch_extractor == "pyin":
286
+ f0 = get_f0_features_using_pyin(audio, cfg)
287
+ elif cfg.pitch_extractor == "parselmouth":
288
+ f0, _ = get_f0_features_using_parselmouth(audio, cfg)
289
+ elif cfg.pitch_extractor == "crepe":
290
+ f0 = get_f0_features_using_crepe(audio, cfg)
291
+ # elif cfg.data.f0_extractor == 'cwt': # todo
292
+
293
+ return f0
294
+
295
+
296
+ def get_cents(f0_hz):
297
+ """
298
+ F_{cent} = 1200 * log2 (F/440)
299
+
300
+ Reference:
301
+ APSIPA'17, Perceptual Evaluation of Singing Quality
302
+ """
303
+ voiced_f0 = f0_hz[f0_hz != 0]
304
+ return 1200 * np.log2(voiced_f0 / 440)
305
+
306
+
307
+ def get_pitch_derivatives(f0_hz):
308
+ """
309
+ f0_hz: (,T)
310
+ """
311
+ f0_cent = get_cents(f0_hz)
312
+ return f0_cent[1:] - f0_cent[:-1]
313
+
314
+
315
+ def get_pitch_sub_median(f0_hz):
316
+ """
317
+ f0_hz: (,T)
318
+ """
319
+ f0_cent = get_cents(f0_hz)
320
+ return f0_cent - np.median(f0_cent)
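A tiny example of interpolate() above on a toy contour (assuming the file is importable as utils.f0): zeros mark unvoiced frames, which get filled by linear interpolation while uv keeps an unvoiced mask of the same length:

import numpy as np
from utils.f0 import interpolate

f0 = np.array([0.0, 0.0, 220.0, 230.0, 0.0, 0.0, 240.0, 0.0])
f0_interp, uv = interpolate(f0.copy())

print(f0_interp)  # gaps filled; edges take the nearest voiced value
print(uv)         # eroded and padded unvoiced mask, same length as f0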
utils/hparam.py ADDED
@@ -0,0 +1,659 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py pylint: disable=line-too-long
7
+ """Hyperparameter values."""
8
+ from __future__ import absolute_import
9
+ from __future__ import division
10
+ from __future__ import print_function
11
+
12
+ import json
13
+ import numbers
14
+ import re
15
+ import six
16
+
17
+ # Define the regular expression for parsing a single clause of the input
18
+ # (delimited by commas). A legal clause looks like:
19
+ # <variable name>[<index>]? = <rhs>
20
+ # where <rhs> is either a single token or [] enclosed list of tokens.
21
+ # For example: "var[1] = a" or "x = [1,2,3]"
22
+ PARAM_RE = re.compile(
23
+ r"""
24
+ (?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
25
+ (\[\s*(?P<index>\d+)\s*\])? # (optional) index: "1" or None
26
+ \s*=\s*
27
+ ((?P<val>[^,\[]*) # single value: "a" or None
28
+ |
29
+ \[(?P<vals>[^\]]*)\]) # list of values: None or "1,2,3"
30
+ ($|,\s*)""",
31
+ re.VERBOSE,
32
+ )
33
+
34
+
35
+ def _parse_fail(name, var_type, value, values):
36
+ """Helper function for raising a value error for bad assignment."""
37
+ raise ValueError(
38
+ "Could not parse hparam '%s' of type '%s' with value '%s' in %s"
39
+ % (name, var_type.__name__, value, values)
40
+ )
41
+
42
+
43
+ def _reuse_fail(name, values):
44
+ """Helper function for raising a value error for reuse of name."""
45
+ raise ValueError("Multiple assignments to variable '%s' in %s" % (name, values))
46
+
47
+
48
+ def _process_scalar_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
49
+ """Update results_dictionary with a scalar value.
50
+
51
+ Used to update the results_dictionary to be returned by parse_values when
52
+ encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
53
+
54
+ Mutates results_dictionary.
55
+
56
+ Args:
57
+ name: Name of variable in assignment ("s" or "arr").
58
+ parse_fn: Function for parsing the actual value.
59
+ var_type: Type of named variable.
60
+ m_dict: Dictionary constructed from regex parsing.
61
+ m_dict['val']: RHS value (scalar)
62
+ m_dict['index']: List index value (or None)
63
+ values: Full expression being parsed
64
+ results_dictionary: The dictionary being updated for return by the parsing
65
+ function.
66
+
67
+ Raises:
68
+ ValueError: If the name has already been used.
69
+ """
70
+ try:
71
+ parsed_value = parse_fn(m_dict["val"])
72
+ except ValueError:
73
+ _parse_fail(name, var_type, m_dict["val"], values)
74
+
75
+ # If no index is provided
76
+ if not m_dict["index"]:
77
+ if name in results_dictionary:
78
+ _reuse_fail(name, values)
79
+ results_dictionary[name] = parsed_value
80
+ else:
81
+ if name in results_dictionary:
82
+ # If the name has already been used as a scalar, it
83
+ # will be in this dictionary and map to a non-dictionary.
84
+ if not isinstance(results_dictionary.get(name), dict):
85
+ _reuse_fail(name, values)
86
+ else:
87
+ results_dictionary[name] = {}
88
+
89
+ index = int(m_dict["index"])
90
+ # Make sure the index position hasn't already been assigned a value.
91
+ if index in results_dictionary[name]:
92
+ _reuse_fail("{}[{}]".format(name, index), values)
93
+ results_dictionary[name][index] = parsed_value
94
+
95
+
96
+ def _process_list_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
97
+ """Update results_dictionary from a list of values.
98
+
99
+ Used to update results_dictionary to be returned by parse_values when
100
+ encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
101
+
102
+ Mutates results_dictionary.
103
+
104
+ Args:
105
+ name: Name of variable in assignment ("arr").
106
+ parse_fn: Function for parsing individual values.
107
+ var_type: Type of named variable.
108
+ m_dict: Dictionary constructed from regex parsing.
109
+ m_dict['val']: RHS value (scalar)
110
+ values: Full expression being parsed
111
+ results_dictionary: The dictionary being updated for return by the parsing
112
+ function.
113
+
114
+ Raises:
115
+ ValueError: If the name has an index or the values cannot be parsed.
116
+ """
117
+ if m_dict["index"] is not None:
118
+ raise ValueError("Assignment of a list to a list index.")
119
+ elements = filter(None, re.split("[ ,]", m_dict["vals"]))
120
+ # Make sure the name hasn't already been assigned a value
121
+ if name in results_dictionary:
122
+ raise _reuse_fail(name, values)
123
+ try:
124
+ results_dictionary[name] = [parse_fn(e) for e in elements]
125
+ except ValueError:
126
+ _parse_fail(name, var_type, m_dict["vals"], values)
127
+
128
+
129
+ def _cast_to_type_if_compatible(name, param_type, value):
130
+ """Cast hparam to the provided type, if compatible.
131
+
132
+ Args:
133
+ name: Name of the hparam to be cast.
134
+ param_type: The type of the hparam.
135
+ value: The value to be cast, if compatible.
136
+
137
+ Returns:
138
+ The result of casting `value` to `param_type`.
139
+
140
+ Raises:
141
+ ValueError: If the type of `value` is not compatible with param_type.
142
+ * If `param_type` is a string type, but `value` is not.
143
+ * If `param_type` is a boolean, but `value` is not, or vice versa.
144
+ * If `param_type` is an integer type, but `value` is not.
145
+ * If `param_type` is a float type, but `value` is not a numeric type.
146
+ """
147
+ fail_msg = "Could not cast hparam '%s' of type '%s' from value %r" % (
148
+ name,
149
+ param_type,
150
+ value,
151
+ )
152
+
153
+ # Some callers use None, for which we can't do any casting/checking. :(
154
+ if issubclass(param_type, type(None)):
155
+ return value
156
+
157
+ # Avoid converting a non-string type to a string.
158
+ if issubclass(param_type, (six.string_types, six.binary_type)) and not isinstance(
159
+ value, (six.string_types, six.binary_type)
160
+ ):
161
+ raise ValueError(fail_msg)
162
+
163
+ # Avoid converting a number or string type to a boolean or vice versa.
164
+ if issubclass(param_type, bool) != isinstance(value, bool):
165
+ raise ValueError(fail_msg)
166
+
167
+ # Avoid converting float to an integer (the reverse is fine).
168
+ if issubclass(param_type, numbers.Integral) and not isinstance(
169
+ value, numbers.Integral
170
+ ):
171
+ raise ValueError(fail_msg)
172
+
173
+ # Avoid converting a non-numeric type to a numeric type.
174
+ if issubclass(param_type, numbers.Number) and not isinstance(value, numbers.Number):
175
+ raise ValueError(fail_msg)
176
+
177
+ return param_type(value)
178
+
179
+
180
+ def parse_values(values, type_map, ignore_unknown=False):
181
+ """Parses hyperparameter values from a string into a python map.
182
+
183
+ `values` is a string containing comma-separated `name=value` pairs.
184
+ For each pair, the value of the hyperparameter named `name` is set to
185
+ `value`.
186
+
187
+ If a hyperparameter name appears multiple times in `values`, a ValueError
188
+ is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
189
+
190
+ If a hyperparameter name appears in both an index assignment and a scalar assignment,
191
+ a ValueError is raised. (e.g. 'a=[1,2,3],a[0] = 1').
192
+
193
+ The hyperparameter name may contain '.' symbols, which will result in an
194
+ attribute name that is only accessible through the getattr and setattr
195
+ functions. (And must first be explicitly added through add_hparam.)
196
+
197
+ WARNING: Use of '.' in your variable names is allowed, but is not well
198
+ supported and not recommended.
199
+
200
+ The `value` in `name=value` must follow the syntax according to the
201
+ type of the parameter:
202
+
203
+ * Scalar integer: A Python-parsable integer value. E.g.: 1,
204
+ 100, -12.
205
+ * Scalar float: A Python-parsable floating point value. E.g.: 1.0,
206
+ -.54e89.
207
+ * Boolean: Either true or false.
208
+ * Scalar string: A non-empty sequence of characters, excluding comma,
209
+ spaces, and square brackets. E.g.: foo, bar_1.
210
+ * List: A comma separated list of scalar values of the parameter type
211
+ enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
212
+
213
+ When index assignment is used, the corresponding type_map key should be the
214
+ list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
215
+ "arr[1]").
216
+
217
+ Args:
218
+ values: String. Comma separated list of `name=value` pairs where
219
+ 'value' must follow the syntax described above.
220
+ type_map: A dictionary mapping hyperparameter names to types. Note every
221
+ parameter name in values must be a key in type_map. The values must
222
+ conform to the types indicated, where a value V is said to conform to a
223
+ type T if either V has type T, or V is a list of elements of type T.
224
+ Hence, for a multidimensional parameter 'x' taking float values,
225
+ 'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
226
+ ignore_unknown: Bool. Whether values that are missing a type in type_map
227
+ should be ignored. If set to True, a ValueError will not be raised for
228
+ unknown hyperparameter type.
229
+
230
+ Returns:
231
+ A python map mapping each name to either:
232
+ * A scalar value.
233
+ * A list of scalar values.
234
+ * A dictionary mapping index numbers to scalar values.
235
+ (e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
236
+
237
+ Raises:
238
+ ValueError: If there is a problem with input.
239
+ * If `values` cannot be parsed.
240
+ * If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
241
+ * If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
242
+ 'a[1]=1,a[1]=2', or 'a=1,a=[1]')
243
+ """
244
+ results_dictionary = {}
245
+ pos = 0
246
+ while pos < len(values):
247
+ m = PARAM_RE.match(values, pos)
248
+ if not m:
249
+ raise ValueError("Malformed hyperparameter value: %s" % values[pos:])
250
+ # Check that there is a comma between parameters and move past it.
251
+ pos = m.end()
252
+ # Parse the values.
253
+ m_dict = m.groupdict()
254
+ name = m_dict["name"]
255
+ if name not in type_map:
256
+ if ignore_unknown:
257
+ continue
258
+ raise ValueError("Unknown hyperparameter type for %s" % name)
259
+ type_ = type_map[name]
260
+
261
+ # Set up correct parsing function (depending on whether type_ is a bool)
262
+ if type_ == bool:
263
+
264
+ def parse_bool(value):
265
+ if value in ["true", "True"]:
266
+ return True
267
+ elif value in ["false", "False"]:
268
+ return False
269
+ else:
270
+ try:
271
+ return bool(int(value))
272
+ except ValueError:
273
+ _parse_fail(name, type_, value, values)
274
+
275
+ parse = parse_bool
276
+ else:
277
+ parse = type_
278
+
279
+ # If a single value is provided
280
+ if m_dict["val"] is not None:
281
+ _process_scalar_value(
282
+ name, parse, type_, m_dict, values, results_dictionary
283
+ )
284
+
285
+ # If the assigned value is a list:
286
+ elif m_dict["vals"] is not None:
287
+ _process_list_value(name, parse, type_, m_dict, values, results_dictionary)
288
+
289
+ else: # Not assigned a list or value
290
+ _parse_fail(name, type_, "", values)
291
+
292
+ return results_dictionary
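The docstring's own example for parse_values, runnable as-is if the repository root is on PYTHONPATH:

from utils.hparam import parse_values

type_map = {"x": int, "L": int, "arr": int}
print(parse_values("x=5,L=[1,2],arr[1]=3", type_map))
# {'x': 5, 'L': [1, 2], 'arr': {1: 3}}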
293
+
294
+
295
+ class HParams(object):
296
+ """Class to hold a set of hyperparameters as name-value pairs.
297
+
298
+ A `HParams` object holds hyperparameters used to build and train a model,
299
+ such as the number of hidden units in a neural net layer or the learning rate
300
+ to use when training.
301
+
302
+ You first create a `HParams` object by specifying the names and values of the
303
+ hyperparameters.
304
+
305
+ To make them easily accessible the parameter names are added as direct
306
+ attributes of the class. A typical usage is as follows:
307
+
308
+ ```python
309
+ # Create a HParams object specifying names and values of the model
310
+ # hyperparameters:
311
+ hparams = HParams(learning_rate=0.1, num_hidden_units=100)
312
+
313
+ # The hyperparameter are available as attributes of the HParams object:
314
+ hparams.learning_rate ==> 0.1
315
+ hparams.num_hidden_units ==> 100
316
+ ```
317
+
318
+ Hyperparameters have type, which is inferred from the type of their value
319
+ passed at construction time. The currently supported types are: integer,
320
+ float, boolean, string, and list of integer, float, boolean, or string.
321
+
322
+ You can override hyperparameter values by calling the
323
+ [`parse()`](#HParams.parse) method, passing a string of comma separated
324
+ `name=value` pairs. This is intended to make it possible to override
325
+ any hyperparameter values from a single command-line flag to which
326
+ the user passes 'hyper-param=value' pairs. It avoids having to define
327
+ one flag for each hyperparameter.
328
+
329
+ The syntax expected for each value depends on the type of the parameter.
330
+ See `parse()` for a description of the syntax.
331
+
332
+ Example:
333
+
334
+ ```python
335
+ # Define a command line flag to pass name=value pairs.
336
+ # For example using argparse:
337
+ import argparse
338
+ parser = argparse.ArgumentParser(description='Train my model.')
339
+ parser.add_argument('--hparams', type=str,
340
+ help='Comma separated list of "name=value" pairs.')
341
+ args = parser.parse_args()
342
+ ...
343
+ def my_program():
344
+ # Create a HParams object specifying the names and values of the
345
+ # model hyperparameters:
346
+ hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
347
+ activations=['relu', 'tanh'])
348
+
349
+ # Override hyperparameters values by parsing the command line
350
+ hparams.parse(args.hparams)
351
+
352
+ # If the user passed `--hparams=learning_rate=0.3` on the command line
353
+ # then 'hparams' has the following attributes:
354
+ hparams.learning_rate ==> 0.3
355
+ hparams.num_hidden_units ==> 100
356
+ hparams.activations ==> ['relu', 'tanh']
357
+
358
+ # If the hyperparameters are in json format use parse_json:
359
+ hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
360
+ ```
361
+ """
362
+
363
+ _HAS_DYNAMIC_ATTRIBUTES = True # Required for pytype checks.
364
+
365
+ def __init__(self, model_structure=None, **kwargs):
366
+ """Create an instance of `HParams` from keyword arguments.
367
+
368
+ The keyword arguments specify name-value pairs for the hyperparameters.
369
+ The parameter types are inferred from the type of the values passed.
370
+
371
+ The parameter names are added as attributes of `HParams` object, so they
372
+ can be accessed directly with the dot notation `hparams._name_`.
373
+
374
+ Example:
375
+
376
+ ```python
377
+ # Define 3 hyperparameters: 'learning_rate' is a float parameter,
378
+ # 'num_hidden_units' an integer parameter, and 'activation' a string
379
+ # parameter.
380
+ hparams = tf.HParams(
381
+ learning_rate=0.1, num_hidden_units=100, activation='relu')
382
+
383
+ hparams.activation ==> 'relu'
384
+ ```
385
+
386
+ Note that a few names are reserved and cannot be used as hyperparameter
387
+ names. If you use one of the reserved names, the constructor raises a
388
+ `ValueError`.
389
+
390
+ Args:
391
+ model_structure: An instance of ModelStructure, defining the feature
392
+ crosses to be used in the Trial.
393
+ **kwargs: Key-value pairs where the key is the hyperparameter name and
394
+ the value is the value for the parameter.
395
+
396
+ Raises:
397
+ ValueError: If one of the arguments is invalid, for example if a reserved
398
+ name is used as a hyperparameter name.
399
+
400
+ """
401
+ # Register the hyperparameters and their type in _hparam_types.
402
+ # This simplifies the implementation of parse().
403
+ # _hparam_types maps the parameter name to a tuple (type, bool).
404
+ # The type value is the type of the parameter for scalar hyperparameters,
405
+ # or the type of the list elements for multidimensional hyperparameters.
406
+ # The bool value is True if the value is a list, False otherwise.
407
+ self._hparam_types = {}
408
+ self._model_structure = model_structure
409
+ for name, value in six.iteritems(kwargs):
410
+ self.add_hparam(name, value)
411
+
412
+ def add_hparam(self, name, value):
413
+ """Adds {name, value} pair to hyperparameters.
414
+
415
+ Args:
416
+ name: Name of the hyperparameter.
417
+ value: Value of the hyperparameter. Can be one of the following types:
418
+ int, float, string, int list, float list, or string list.
419
+
420
+ Raises:
421
+ ValueError: if one of the arguments is invalid.
422
+ """
423
+ # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
424
+ # attribute of this object. In that case we refuse to use it as a
425
+ # hyperparameter name.
426
+ if getattr(self, name, None) is not None:
427
+ raise ValueError("Hyperparameter name is reserved: %s" % name)
428
+ if isinstance(value, (list, tuple)):
429
+ if not value:
430
+ raise ValueError(
431
+ "Multi-valued hyperparameters cannot be empty: %s" % name
432
+ )
433
+ self._hparam_types[name] = (type(value[0]), True)
434
+ else:
435
+ self._hparam_types[name] = (type(value), False)
436
+ setattr(self, name, value)
437
+
438
+ def set_hparam(self, name, value):
439
+ """Set the value of an existing hyperparameter.
440
+
441
+ This function verifies that the type of the value matches the type of the
442
+ existing hyperparameter.
443
+
444
+ Args:
445
+ name: Name of the hyperparameter.
446
+ value: New value of the hyperparameter.
447
+
448
+ Raises:
449
+ KeyError: If the hyperparameter doesn't exist.
450
+ ValueError: If there is a type mismatch.
451
+ """
452
+ param_type, is_list = self._hparam_types[name]
453
+ if isinstance(value, list):
454
+ if not is_list:
455
+ raise ValueError(
456
+ "Must not pass a list for single-valued parameter: %s" % name
457
+ )
458
+ setattr(
459
+ self,
460
+ name,
461
+ [_cast_to_type_if_compatible(name, param_type, v) for v in value],
462
+ )
463
+ else:
464
+ if is_list:
465
+ raise ValueError(
466
+ "Must pass a list for multi-valued parameter: %s." % name
467
+ )
468
+ setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
469
+
470
+ def del_hparam(self, name):
471
+ """Removes the hyperparameter with key 'name'.
472
+
473
+ Does nothing if it isn't present.
474
+
475
+ Args:
476
+ name: Name of the hyperparameter.
477
+ """
478
+ if hasattr(self, name):
479
+ delattr(self, name)
480
+ del self._hparam_types[name]
481
+
482
+ def parse(self, values):
483
+ """Override existing hyperparameter values, parsing new values from a string.
484
+
485
+ See parse_values for more detail on the allowed format for values.
486
+
487
+ Args:
488
+ values: String. Comma separated list of `name=value` pairs where 'value'
489
+ must follow the syntax described above.
490
+
491
+ Returns:
492
+ The `HParams` instance.
493
+
494
+ Raises:
495
+ ValueError: If `values` cannot be parsed or a hyperparameter in `values`
496
+ doesn't exist.
497
+ """
498
+ type_map = {}
499
+ for name, t in self._hparam_types.items():
500
+ param_type, _ = t
501
+ type_map[name] = param_type
502
+
503
+ values_map = parse_values(values, type_map)
504
+ return self.override_from_dict(values_map)
505
+
506
+ def override_from_dict(self, values_dict):
507
+ """Override existing hyperparameter values, parsing new values from a dictionary.
508
+
509
+ Args:
510
+ values_dict: Dictionary of name:value pairs.
511
+
512
+ Returns:
513
+ The `HParams` instance.
514
+
515
+ Raises:
516
+ KeyError: If a hyperparameter in `values_dict` doesn't exist.
517
+ ValueError: If `values_dict` cannot be parsed.
518
+ """
519
+ for name, value in values_dict.items():
520
+ self.set_hparam(name, value)
521
+ return self
522
+
523
+ def set_model_structure(self, model_structure):
524
+ self._model_structure = model_structure
525
+
526
+ def get_model_structure(self):
527
+ return self._model_structure
528
+
529
+ def to_json(self, indent=None, separators=None, sort_keys=False):
530
+ """Serializes the hyperparameters into JSON.
531
+
532
+ Args:
533
+ indent: If a non-negative integer, JSON array elements and object members
534
+ will be pretty-printed with that indent level. An indent level of 0, or
535
+ negative, will only insert newlines. `None` (the default) selects the
536
+ most compact representation.
537
+ separators: Optional `(item_separator, key_separator)` tuple. Default is
538
+ `(', ', ': ')`.
539
+ sort_keys: If `True`, the output dictionaries will be sorted by key.
540
+
541
+ Returns:
542
+ A JSON string.
543
+ """
544
+
545
+ def remove_callables(x):
546
+ """Omit callable elements from input with arbitrary nesting."""
547
+ if isinstance(x, dict):
548
+ return {
549
+ k: remove_callables(v)
550
+ for k, v in six.iteritems(x)
551
+ if not callable(v)
552
+ }
553
+ elif isinstance(x, list):
554
+ return [remove_callables(i) for i in x if not callable(i)]
555
+ return x
556
+
557
+ return json.dumps(
558
+ remove_callables(self.values()),
559
+ indent=indent,
560
+ separators=separators,
561
+ sort_keys=sort_keys,
562
+ )
563
+
564
+ def parse_json(self, values_json):
565
+ """Override existing hyperparameter values, parsing new values from a json object.
566
+
567
+ Args:
568
+ values_json: String containing a json object of name:value pairs.
569
+
570
+ Returns:
571
+ The `HParams` instance.
572
+
573
+ Raises:
574
+ KeyError: If a hyperparameter in `values_json` doesn't exist.
575
+ ValueError: If `values_json` cannot be parsed.
576
+ """
577
+ values_map = json.loads(values_json)
578
+ return self.override_from_dict(values_map)
579
+
580
+ def values(self):
581
+ """Return the hyperparameter values as a Python dictionary.
582
+
583
+ Returns:
584
+ A dictionary with hyperparameter names as keys. The values are the
585
+ hyperparameter values.
586
+ """
587
+ return {n: getattr(self, n) for n in self._hparam_types.keys()}
588
+
589
+ def get(self, key, default=None):
590
+ """Returns the value of `key` if it exists, else `default`."""
591
+ if key in self._hparam_types:
592
+ # Ensure that default is compatible with the parameter type.
593
+ if default is not None:
594
+ param_type, is_param_list = self._hparam_types[key]
595
+ type_str = "list<%s>" % param_type if is_param_list else str(param_type)
596
+ fail_msg = (
597
+ "Hparam '%s' of type '%s' is incompatible with "
598
+ "default=%s" % (key, type_str, default)
599
+ )
600
+
601
+ is_default_list = isinstance(default, list)
602
+ if is_param_list != is_default_list:
603
+ raise ValueError(fail_msg)
604
+
605
+ try:
606
+ if is_default_list:
607
+ for value in default:
608
+ _cast_to_type_if_compatible(key, param_type, value)
609
+ else:
610
+ _cast_to_type_if_compatible(key, param_type, default)
611
+ except ValueError as e:
612
+ raise ValueError("%s. %s" % (fail_msg, e))
613
+
614
+ return getattr(self, key)
615
+
616
+ return default
617
+
618
+ def __contains__(self, key):
619
+ return key in self._hparam_types
620
+
621
+ def __str__(self):
622
+ return str(sorted(self.values().items()))
623
+
624
+ def __repr__(self):
625
+ return "%s(%s)" % (type(self).__name__, self.__str__())
626
+
627
+ @staticmethod
628
+ def _get_kind_name(param_type, is_list):
629
+ """Returns the field name given parameter type and is_list.
630
+
631
+ Args:
632
+ param_type: Data type of the hparam.
633
+ is_list: Whether this is a list.
634
+
635
+ Returns:
636
+ A string representation of the field name.
637
+
638
+ Raises:
639
+ ValueError: If parameter type is not recognized.
640
+ """
641
+ if issubclass(param_type, bool):
642
+ # This check must happen before issubclass(param_type, six.integer_types),
643
+ # since Python considers bool to be a subclass of int.
644
+ typename = "bool"
645
+ elif issubclass(param_type, six.integer_types):
646
+ # Setting 'int' and 'long' types to be 'int64' to ensure the type is
647
+ # compatible with both Python2 and Python3.
648
+ typename = "int64"
649
+ elif issubclass(param_type, (six.string_types, six.binary_type)):
650
+ # Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
651
+ # compatible with both Python2 and Python3.
652
+ typename = "bytes"
653
+ elif issubclass(param_type, float):
654
+ typename = "float"
655
+ else:
656
+ raise ValueError("Unsupported parameter type: %s" % str(param_type))
657
+
658
+ suffix = "list" if is_list else "value"
659
+ return "_".join([typename, suffix])
utils/hubert.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/svc-develop-team/so-vits-svc/blob/4.0/preprocess_hubert_f0.py
7
+
8
+ import os
9
+ import librosa
10
+ import torch
11
+ import numpy as np
12
+ from fairseq import checkpoint_utils
13
+ from tqdm import tqdm
14
+ import torch
15
+
16
+
17
+ def load_hubert_model(hps):
18
+ # Load model
19
+ ckpt_path = hps.hubert_file
20
+ print("Load Hubert Model...")
21
+
22
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
23
+ [ckpt_path],
24
+ suffix="",
25
+ )
26
+ model = models[0]
27
+ model.eval()
28
+
29
+ if torch.cuda.is_available():
30
+ model = model.cuda()
31
+
32
+ return model
33
+
34
+
35
+ def get_hubert_content(hmodel, wav_16k_tensor):
36
+ feats = wav_16k_tensor
37
+ if feats.dim() == 2: # double channels
38
+ feats = feats.mean(-1)
39
+ assert feats.dim() == 1, feats.dim()
40
+ feats = feats.view(1, -1)
41
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
42
+ inputs = {
43
+ "source": feats.to(wav_16k_tensor.device),
44
+ "padding_mask": padding_mask.to(wav_16k_tensor.device),
45
+ "output_layer": 9, # layer 9
46
+ }
47
+ with torch.no_grad():
48
+ logits = hmodel.extract_features(**inputs)
49
+ feats = hmodel.final_proj(logits[0]).squeeze(0)
50
+
51
+ return feats
52
+
53
+
54
+ def content_vector_encoder(model, audio_path, default_sampling_rate=16000):
55
+ """
56
+ # content vector default sr: 16000
57
+ """
58
+
59
+ wav16k, sr = librosa.load(audio_path, sr=default_sampling_rate)
60
+ device = next(model.parameters()).device
61
+ wav16k = torch.from_numpy(wav16k).to(device)
62
+
63
+ # (frame_len, 256)
64
+ content_feature = get_hubert_content(model, wav_16k_tensor=wav16k)
65
+
66
+ return content_feature.cpu().detach().numpy()
67
+
68
+
69
+ def repeat_expand_2d(content, target_len):
70
+ """
71
+ content : [hubert_dim(256), src_len]
72
+ target: [hubert_dim(256), target_len]
73
+ """
74
+ src_len = content.shape[-1]
75
+ target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(
76
+ content.device
77
+ )
78
+ temp = torch.arange(src_len + 1) * target_len / src_len
79
+ current_pos = 0
80
+ for i in range(target_len):
81
+ if i < temp[current_pos + 1]:
82
+ target[:, i] = content[:, current_pos]
83
+ else:
84
+ current_pos += 1
85
+ target[:, i] = content[:, current_pos]
86
+
87
+ return target
88
+
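+ # Illustrative sketch (added comment, not in the original code): `repeat_expand_2d`
+ # performs a nearest-neighbor stretch along the time axis, e.g. aligning HuBERT
+ # features to a longer mel sequence:
+ #
+ #   content = torch.randn(256, 100)           # hypothetical (hubert_dim, src_len) features
+ #   aligned = repeat_expand_2d(content, 250)  # -> shape (256, 250)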
89
+
90
+ def get_mapped_features(raw_content_features, mapping_features):
91
+ """
92
+ Content Vector: frameshift = 20ms, hop_size = 480 in 24k
93
+
94
+ Now it's only used for mapping to bigvgan's mels (sr = 24k, hop_size = 256, frameshift ~= 10.7 ms)
95
+ """
96
+ source_hop = 480
97
+ target_hop = 256
98
+
99
+ factor = np.gcd(source_hop, target_hop)
100
+ source_hop //= factor
101
+ target_hop //= factor
102
+ print(
103
+ "Mapping source's {} frames => target's {} frames".format(
104
+ target_hop, source_hop
105
+ )
106
+ )
107
+
108
+ results = []
109
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
110
+ # mappping_feat: (mels_frame_len, n_mels)
111
+ target_len = len(mapping_feat)
112
+
113
+ # (source_len, 256)
114
+ raw_feats = raw_content_features[index][0].cpu().numpy().T
115
+ source_len, width = raw_feats.shape
116
+
117
+ # const ~= target_len * target_hop
118
+ const = source_len * source_hop // target_hop * target_hop
119
+
120
+ # (source_len * source_hop, dim)
121
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
122
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
123
+ down_sampling_feats = np.average(
124
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
125
+ )
126
+
127
+ err = abs(target_len - len(down_sampling_feats))
128
+ if err > 3:
129
+ print("index:", index)
130
+ print("mels:", mapping_feat.shape)
131
+ print("raw content vector:", raw_feats.shape)
132
+ print("up_sampling:", up_sampling_feats.shape)
133
+ print("down_sampling_feats:", down_sampling_feats.shape)
134
+ exit()
135
+ if len(down_sampling_feats) < target_len:
136
+ # (1, dim) -> (err, dim)
137
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
138
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
139
+
140
+ # (target_len, dim)
141
+ feats = down_sampling_feats[:target_len]
142
+ results.append(feats)
143
+
144
+ return results
145
+
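+ # Worked example (added comment; the numbers follow from the code above): with source_hop=480
+ # and target_hop=256 the gcd is 32, so the hops reduce to 15 and 8. Each source frame is
+ # repeated 15 times and the repeated sequence is averaged in blocks of 8, so every 8 source
+ # frames map to 15 target frames (8 * 480 samples == 15 * 256 samples). Any residual length
+ # mismatch of a few frames is patched by repeating the last frame, as implemented above.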
146
+
147
+ def extract_hubert_features_of_dataset(datasets, model, out_dir):
148
+ for utt in tqdm(datasets):
149
+ uid = utt["Uid"]
150
+ audio_path = utt["Path"]
151
+
152
+ content_vector_feature = content_vector_encoder(model, audio_path) # (T, 256)
153
+
154
+ save_path = os.path.join(out_dir, uid + ".npy")
155
+ np.save(save_path, content_vector_feature)
utils/io.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+
11
+
12
+ def save_feature(process_dir, feature_dir, item, feature, overrides=True):
13
+ """Save features to path
14
+
15
+ Args:
16
+ process_dir (str): directory to store features
17
+ feature_dir (str): directory to store one type of features (mel, energy, ...)
18
+ item (str): uid
19
+ feature (tensor): feature tensor
20
+ overrides (bool, optional): whether to override existing files. Defaults to True.
21
+ """
22
+ process_dir = os.path.join(process_dir, feature_dir)
23
+ os.makedirs(process_dir, exist_ok=True)
24
+ out_path = os.path.join(process_dir, item + ".npy")
25
+
26
+ if os.path.exists(out_path):
27
+ if overrides:
28
+ np.save(out_path, feature)
29
+ else:
30
+ np.save(out_path, feature)
31
+
32
+
33
+ def save_txt(process_dir, feature_dir, item, feature, overrides=True):
34
+ process_dir = os.path.join(process_dir, feature_dir)
35
+ os.makedirs(process_dir, exist_ok=True)
36
+ out_path = os.path.join(process_dir, item + ".txt")
37
+
38
+ if os.path.exists(out_path):
39
+ if overrides:
40
+ f = open(out_path, "w")
41
+ f.writelines(feature)
42
+ f.close()
43
+ else:
44
+ f = open(out_path, "w")
45
+ f.writelines(feature)
46
+ f.close()
47
+
48
+
49
+ def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
50
+ if turn_up:
51
+ # continue to turn up to volume_peak
52
+ ratio = volume_peak / max(waveform.max(), abs(waveform.min()))
53
+ waveform = waveform * ratio
54
+
55
+ if add_silence:
56
+ silence_len = fs // 20
57
+ silence = np.zeros((silence_len,), dtype=waveform.dtype)
58
+ result = np.concatenate([silence, waveform, silence])
59
+ waveform = result
60
+
61
+ waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
62
+ if len(waveform.size()) == 1:
63
+ waveform = waveform[None, :]
64
+ elif waveform.size(0) != 1:
65
+ # Stereo to mono
66
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
67
+ torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)
68
+
69
+
70
+ async def async_load_audio(path, sample_rate: int = 24000):
71
+ r"""
72
+ Args:
73
+ path: The source loading path.
74
+ sample_rate: The target sample rate, will automatically resample if necessary.
75
+
76
+ Returns:
77
+ waveform: The waveform object. Should be [1 x sequence_len].
78
+ """
79
+
80
+ async def use_torchaudio_load(path):
81
+ return torchaudio.load(path)
82
+
83
+ waveform, sr = await use_torchaudio_load(path)
84
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
85
+
86
+ if sr != sample_rate:
87
+ waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
88
+
89
+ if torch.any(torch.isnan(waveform)) or torch.any(torch.isinf(waveform)):
90
+ raise ValueError("NaN or Inf found in waveform.")
91
+ return waveform
92
+
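+ # Usage sketch (added comment; the path is hypothetical): the coroutine can be driven with
+ # asyncio, e.g.
+ #
+ #   import asyncio
+ #   wav = asyncio.run(async_load_audio("/path/to/audio.wav", sample_rate=24000))
+ #   # wav: torch.Tensor of shape [1, sequence_len]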
93
+
94
+ async def async_save_audio(
95
+ path,
96
+ waveform,
97
+ sample_rate: int = 24000,
98
+ add_silence: bool = False,
99
+ volume_peak: float = 0.9,
100
+ ):
101
+ r"""
102
+ Args:
103
+ path: The target saving path.
104
+ waveform: The waveform object. Should be [n_channel x sequence_len].
105
+ sample_rate: Sample rate.
106
+ add_silence: If ``true``, concat 0.05s silence to beginning and end.
107
+ volume_peak: Turn up volume for larger number, vice versa.
108
+ """
109
+
110
+ async def use_torchaudio_save(path, waveform, sample_rate):
111
+ torchaudio.save(
112
+ path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
113
+ )
114
+
115
+ waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
116
+ shape = waveform.size()[:-1]
117
+
118
+ ratio = abs(volume_peak) / max(waveform.max(), abs(waveform.min()))
119
+ waveform = waveform * ratio
120
+
121
+ if add_silence:
122
+ silence_len = sample_rate // 20
123
+ silence = torch.zeros((*shape, silence_len), dtype=waveform.dtype)
124
+ waveform = torch.concatenate((silence, waveform, silence), dim=-1)
125
+
126
+ if waveform.dim() == 1:
127
+ waveform = waveform[None]
128
+
129
+ await use_torchaudio_save(path, waveform, sample_rate)
130
+
131
+
132
+ def load_mel_extrema(cfg, dataset_name, split):
133
+ dataset_dir = os.path.join(
134
+ cfg.OUTPUT_PATH,
135
+ "preprocess/{}_version".format(cfg.data.process_version),
136
+ dataset_name,
137
+ )
138
+
139
+ min_file = os.path.join(
140
+ dataset_dir,
141
+ "mel_min_max",
142
+ split.split("_")[-1],
143
+ "mel_min.npy",
144
+ )
145
+ max_file = os.path.join(
146
+ dataset_dir,
147
+ "mel_min_max",
148
+ split.split("_")[-1],
149
+ "mel_max.npy",
150
+ )
151
+ mel_min = np.load(min_file)
152
+ mel_max = np.load(max_file)
153
+ return mel_min, mel_max
utils/io_optim.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torchaudio
8
+ import json
9
+ import os
10
+ import numpy as np
11
+ import librosa
12
+ from torch.nn.utils.rnn import pad_sequence
13
+ from modules import whisper_extractor as whisper
14
+
15
+
16
+ class TorchaudioDataset(torch.utils.data.Dataset):
17
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
18
+ """
19
+ Args:
20
+ cfg: config
21
+ dataset: dataset name
22
+
23
+ """
24
+ assert isinstance(dataset, str)
25
+
26
+ self.sr = sr
27
+ self.cfg = cfg
28
+
29
+ if metadata is None:
30
+ self.train_metadata_path = os.path.join(
31
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.train_file
32
+ )
33
+ self.valid_metadata_path = os.path.join(
34
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.valid_file
35
+ )
36
+ self.metadata = self.get_metadata()
37
+ else:
38
+ self.metadata = metadata
39
+
40
+ if accelerator is not None:
41
+ self.device = accelerator.device
42
+ elif torch.cuda.is_available():
43
+ self.device = torch.device("cuda")
44
+ else:
45
+ self.device = torch.device("cpu")
46
+
47
+ def get_metadata(self):
48
+ metadata = []
49
+ with open(self.train_metadata_path, "r", encoding="utf-8") as t:
50
+ metadata.extend(json.load(t))
51
+ with open(self.valid_metadata_path, "r", encoding="utf-8") as v:
52
+ metadata.extend(json.load(v))
53
+ return metadata
54
+
55
+ def __len__(self):
56
+ return len(self.metadata)
57
+
58
+ def __getitem__(self, index):
59
+ utt_info = self.metadata[index]
60
+ wav_path = utt_info["Path"]
61
+
62
+ wav, sr = torchaudio.load(wav_path)
63
+
64
+ # resample
65
+ if sr != self.sr:
66
+ wav = torchaudio.functional.resample(wav, sr, self.sr)
67
+ # downmixing
68
+ if wav.shape[0] > 1:
69
+ wav = torch.mean(wav, dim=0, keepdim=True)
70
+ assert wav.shape[0] == 1
71
+ wav = wav.squeeze(0)
72
+ # record the length of wav without padding
73
+ length = wav.shape[0]
74
+ # wav: (T)
75
+ return utt_info, wav, length
76
+
77
+
78
+ class LibrosaDataset(TorchaudioDataset):
79
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
80
+ super().__init__(cfg, dataset, sr, accelerator, metadata)
81
+
82
+ def __getitem__(self, index):
83
+ utt_info = self.metadata[index]
84
+ wav_path = utt_info["Path"]
85
+
86
+ wav, _ = librosa.load(wav_path, sr=self.sr)
87
+ # wav: (T)
88
+ wav = torch.from_numpy(wav)
89
+
90
+ # record the length of wav without padding
91
+ length = wav.shape[0]
92
+ return utt_info, wav, length
93
+
94
+
95
+ class FFmpegDataset(TorchaudioDataset):
96
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
97
+ super().__init__(cfg, dataset, sr, accelerator, metadata)
98
+
99
+ def __getitem__(self, index):
100
+ utt_info = self.metadata[index]
101
+ wav_path = utt_info["Path"]
102
+
103
+ # wav: (T,)
104
+ wav = whisper.load_audio(wav_path) # sr = 16000
105
+ # convert to torch tensor
106
+ wav = torch.from_numpy(wav)
107
+ # record the length of wav without padding
108
+ length = wav.shape[0]
109
+
110
+ return utt_info, wav, length
111
+
112
+
113
+ def collate_batch(batch_list):
114
+ """
115
+ Args:
116
+ batch_list: list of (metadata, wav, length)
117
+ """
118
+ metadata = [item[0] for item in batch_list]
119
+ # wavs: (B, T)
120
+ wavs = pad_sequence([item[1] for item in batch_list], batch_first=True)
121
+ lens = [item[2] for item in batch_list]
122
+
123
+ return metadata, wavs, lens
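+
+
+ # Usage sketch (added comment; the dataset name and batch size are hypothetical):
+ # `collate_batch` is intended as the `collate_fn` of a DataLoader built on one of the
+ # datasets above, e.g.
+ #
+ #   dataset = TorchaudioDataset(cfg, "ljspeech", sr=24000)
+ #   loader = torch.utils.data.DataLoader(
+ #       dataset, batch_size=16, collate_fn=collate_batch, num_workers=4
+ #   )
+ #   for metadata, wavs, lens in loader:
+ #       ...  # wavs: (B, T_max) zero-padded waveforms, lens: original lengths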
utils/mel.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from librosa.filters import mel as librosa_mel_fn
8
+
9
+
10
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
11
+ return torch.log(torch.clamp(x, min=clip_val) * C)
12
+
13
+
14
+ def spectral_normalize_torch(magnitudes):
15
+ output = dynamic_range_compression_torch(magnitudes)
16
+ return output
17
+
18
+
19
+ def extract_linear_features(y, cfg, center=False):
20
+ if torch.min(y) < -1.0:
21
+ print("min value is ", torch.min(y))
22
+ if torch.max(y) > 1.0:
23
+ print("max value is ", torch.max(y))
24
+
25
+ global hann_window
26
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
27
+
28
+ y = torch.nn.functional.pad(
29
+ y.unsqueeze(1),
30
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
31
+ mode="reflect",
32
+ )
33
+ y = y.squeeze(1)
34
+
35
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
36
+ spec = torch.stft(
37
+ y,
38
+ cfg.n_fft,
39
+ hop_length=cfg.hop_size,
40
+ win_length=cfg.win_size,
41
+ window=hann_window[str(y.device)],
42
+ center=center,
43
+ pad_mode="reflect",
44
+ normalized=False,
45
+ onesided=True,
46
+ return_complex=True,
47
+ )
48
+ spec = torch.view_as_real(spec)
49
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
50
+ spec = torch.squeeze(spec, 0)
51
+ return spec
52
+
53
+
54
+ def mel_spectrogram_torch(y, cfg, center=False):
55
+ if torch.min(y) < -1.0:
56
+ print("min value is ", torch.min(y))
57
+ if torch.max(y) > 1.0:
58
+ print("max value is ", torch.max(y))
59
+
60
+ global mel_basis, hann_window
61
+ if cfg.fmax not in mel_basis:
62
+ mel = librosa_mel_fn(
63
+ sr=cfg.sample_rate,
64
+ n_fft=cfg.n_fft,
65
+ n_mels=cfg.n_mel,
66
+ fmin=cfg.fmin,
67
+ fmax=cfg.fmax,
68
+ )
69
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
70
+ torch.from_numpy(mel).float().to(y.device)
71
+ )
72
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
73
+
74
+ y = torch.nn.functional.pad(
75
+ y.unsqueeze(1),
76
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
77
+ mode="reflect",
78
+ )
79
+ y = y.squeeze(1)
80
+
81
+ spec = torch.stft(
82
+ y,
83
+ cfg.n_fft,
84
+ hop_length=cfg.hop_size,
85
+ win_length=cfg.win_size,
86
+ window=hann_window[str(y.device)],
87
+ center=center,
88
+ pad_mode="reflect",
89
+ normalized=False,
90
+ onesided=True,
91
+ return_complex=True,
92
+ )
93
+
94
+ spec = torch.view_as_real(spec)
95
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
96
+
97
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
98
+ spec = spectral_normalize_torch(spec)
99
+
100
+ return spec
101
+
102
+
103
+ mel_basis = {}
104
+ hann_window = {}
105
+
106
+
107
+ def extract_mel_features(
108
+ y,
109
+ cfg,
110
+ center=False
111
+ # n_fft, n_mel, sampling_rate, hop_size, win_size, fmin, fmax, center=False
112
+ ):
113
+ """Extract mel features
114
+
115
+ Args:
116
+ y (tensor): audio data in tensor
117
+ cfg (dict): configuration in cfg.preprocess
118
+ center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
119
+
120
+ Returns:
121
+ tensor: a tensor containing the mel feature calculated based on STFT result
122
+ """
123
+ if torch.min(y) < -1.0:
124
+ print("min value is ", torch.min(y))
125
+ if torch.max(y) > 1.0:
126
+ print("max value is ", torch.max(y))
127
+
128
+ global mel_basis, hann_window
129
+ if cfg.fmax not in mel_basis:
130
+ mel = librosa_mel_fn(
131
+ sr=cfg.sample_rate,
132
+ n_fft=cfg.n_fft,
133
+ n_mels=cfg.n_mel,
134
+ fmin=cfg.fmin,
135
+ fmax=cfg.fmax,
136
+ )
137
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
138
+ torch.from_numpy(mel).float().to(y.device)
139
+ )
140
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
141
+
142
+ y = torch.nn.functional.pad(
143
+ y.unsqueeze(1),
144
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
145
+ mode="reflect",
146
+ )
147
+ y = y.squeeze(1)
148
+
149
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
150
+ spec = torch.stft(
151
+ y,
152
+ cfg.n_fft,
153
+ hop_length=cfg.hop_size,
154
+ win_length=cfg.win_size,
155
+ window=hann_window[str(y.device)],
156
+ center=center,
157
+ pad_mode="reflect",
158
+ normalized=False,
159
+ onesided=True,
160
+ return_complex=True,
161
+ )
162
+ spec = torch.view_as_real(spec)
163
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
164
+
165
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
166
+ spec = spectral_normalize_torch(spec)
167
+
168
+ return spec.squeeze(0)
169
+
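+ # Configuration sketch (added comment; the values below are common 24 kHz settings, not the
+ # project defaults): `extract_mel_features` reads these fields from `cfg` (typically
+ # `cfg.preprocess`):
+ #
+ #   cfg.sample_rate = 24000
+ #   cfg.n_fft = 1024
+ #   cfg.win_size = 1024
+ #   cfg.hop_size = 256
+ #   cfg.n_mel = 100
+ #   cfg.fmin = 0
+ #   cfg.fmax = 12000
+ #
+ # For a (1, T) waveform in [-1, 1] the returned tensor has shape (n_mel, frames).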
170
+
171
+ def extract_mel_features_tts(
172
+ y,
173
+ cfg,
174
+ center=False,
175
+ taco=False,
176
+ _stft=None,
177
+ ):
178
+ """Extract mel features
179
+
180
+ Args:
181
+ y (tensor): audio data in tensor
182
+ cfg (dict): configuration in cfg.preprocess
183
+ center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
184
+ taco: use tacotron mel
185
+
186
+ Returns:
187
+ tensor: a tensor containing the mel feature calculated based on STFT result
188
+ """
189
+ if not taco:
190
+ if torch.min(y) < -1.0:
191
+ print("min value is ", torch.min(y))
192
+ if torch.max(y) > 1.0:
193
+ print("max value is ", torch.max(y))
194
+
195
+ global mel_basis, hann_window
196
+ if cfg.fmax not in mel_basis:
197
+ mel = librosa_mel_fn(
198
+ sr=cfg.sample_rate,
199
+ n_fft=cfg.n_fft,
200
+ n_mels=cfg.n_mel,
201
+ fmin=cfg.fmin,
202
+ fmax=cfg.fmax,
203
+ )
204
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
205
+ torch.from_numpy(mel).float().to(y.device)
206
+ )
207
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
208
+
209
+ y = torch.nn.functional.pad(
210
+ y.unsqueeze(1),
211
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
212
+ mode="reflect",
213
+ )
214
+ y = y.squeeze(1)
215
+
216
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
217
+ spec = torch.stft(
218
+ y,
219
+ cfg.n_fft,
220
+ hop_length=cfg.hop_size,
221
+ win_length=cfg.win_size,
222
+ window=hann_window[str(y.device)],
223
+ center=center,
224
+ pad_mode="reflect",
225
+ normalized=False,
226
+ onesided=True,
227
+ return_complex=True,
228
+ )
229
+ spec = torch.view_as_real(spec)
230
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
231
+
232
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
233
+ spec = spectral_normalize_torch(spec)
234
+ spec = spec.squeeze(0)
235
+ else:
236
+ audio = torch.clip(y, -1, 1)
237
+ audio = torch.autograd.Variable(audio, requires_grad=False)
238
+ spec, energy = _stft.mel_spectrogram(audio)
239
+ spec = torch.squeeze(spec, 0)
240
+
241
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
242
+ spec = spectral_normalize_torch(spec)
243
+
244
+ return spec.squeeze(0)
245
+
246
+
247
+ def amplitude_phase_spectrum(y, cfg):
248
+ hann_window = torch.hann_window(cfg.win_size).to(y.device)
249
+
250
+ y = torch.nn.functional.pad(
251
+ y.unsqueeze(1),
252
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
253
+ mode="reflect",
254
+ )
255
+ y = y.squeeze(1)
256
+
257
+ stft_spec = torch.stft(
258
+ y,
259
+ cfg.n_fft,
260
+ hop_length=cfg.hop_size,
261
+ win_length=cfg.win_size,
262
+ window=hann_window,
263
+ center=False,
264
+ return_complex=True,
265
+ )
266
+
267
+ stft_spec = torch.view_as_real(stft_spec)
268
+ if stft_spec.size()[0] == 1:
269
+ stft_spec = stft_spec.squeeze(0)
270
+
271
+ if len(list(stft_spec.size())) == 4:
272
+ rea = stft_spec[:, :, :, 0] # [batch_size, n_fft//2+1, frames]
273
+ imag = stft_spec[:, :, :, 1] # [batch_size, n_fft//2+1, frames]
274
+ else:
275
+ rea = stft_spec[:, :, 0] # [n_fft//2+1, frames]
276
+ imag = stft_spec[:, :, 1] # [n_fft//2+1, frames]
277
+
278
+ log_amplitude = torch.log(
279
+ torch.abs(torch.sqrt(torch.pow(rea, 2) + torch.pow(imag, 2))) + 1e-5
280
+ ) # [n_fft//2+1, frames]
281
+ phase = torch.atan2(imag, rea) # [n_fft//2+1, frames]
282
+
283
+ return log_amplitude, phase, rea, imag
utils/mert.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://huggingface.co/m-a-p/MERT-v1-330M
7
+
8
+ import torch
9
+ from tqdm import tqdm
10
+ import numpy as np
11
+
12
+ from transformers import Wav2Vec2FeatureExtractor
13
+ from transformers import AutoModel
14
+ import torchaudio
15
+ import torchaudio.transforms as T
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+
19
+ def mert_encoder(model, processor, audio_path, hps):
20
+ """
21
+ # mert default sr: 24000
22
+ """
23
+ with torch.no_grad():
24
+ resample_rate = processor.sampling_rate
25
+ device = next(model.parameters()).device
26
+
27
+ input_audio, sampling_rate = torchaudio.load(audio_path)
28
+ input_audio = input_audio.squeeze()
29
+
30
+ if sampling_rate != resample_rate:
31
+ resampler = T.Resample(sampling_rate, resample_rate)
32
+ input_audio = resampler(input_audio)
33
+
34
+ inputs = processor(
35
+ input_audio, sampling_rate=resample_rate, return_tensors="pt"
36
+ ).to(
37
+ device
38
+ ) # {input_values: tensor, attention_mask: tensor}
39
+
40
+ outputs = model(**inputs, output_hidden_states=True) # list: len is 25
41
+
42
+ # [25 layer, Time steps, 1024 feature_dim]
43
+ # all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
44
+ # mert_features.append(all_layer_hidden_states)
45
+
46
+ feature = outputs.hidden_states[
47
+ hps.mert_feature_layer
48
+ ].squeeze() # [1, frame len, 1024] -> [frame len, 1024]
49
+
50
+ return feature.cpu().detach().numpy()
51
+
52
+
53
+ def mert_features_normalization(raw_mert_features):
54
+ normalized_mert_features = list()
55
+
56
+ mert_features = np.array(raw_mert_features)
57
+ scaler = StandardScaler().fit(mert_features)
58
+ for raw_mert_feature in raw_mert_features:
59
+ normalized_mert_feature = scaler.transform(raw_mert_feature)
60
+ normalized_mert_features.append(normalized_mert_feature)
61
+ return normalized_mert_features
62
+
63
+
64
+ def get_mapped_mert_features(raw_mert_features, mapping_features, fast_mapping=True):
65
+ source_hop = 320
66
+ target_hop = 256
67
+
68
+ factor = np.gcd(source_hop, target_hop)
69
+ source_hop //= factor
70
+ target_hop //= factor
71
+ print(
72
+ "Mapping source's {} frames => target's {} frames".format(
73
+ target_hop, source_hop
74
+ )
75
+ )
76
+
77
+ mert_features = []
78
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
79
+ # mapping_feat: (mels_frame_len, n_mels)
80
+ target_len = mapping_feat.shape[0]
81
+
82
+ # (frame_len, 1024)
83
+ raw_feats = raw_mert_features[index].cpu().numpy()
84
+ source_len, width = raw_feats.shape
85
+
86
+ # const ~= target_len * target_hop
87
+ const = source_len * source_hop // target_hop * target_hop
88
+
89
+ # (source_len * source_hop, dim)
90
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
91
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
92
+ down_sampling_feats = np.average(
93
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
94
+ )
95
+
96
+ err = abs(target_len - len(down_sampling_feats))
97
+ if err > 3:
98
+ print("index:", index)
99
+ print("mels:", mapping_feat.shape)
100
+ print("raw mert vector:", raw_feats.shape)
101
+ print("up_sampling:", up_sampling_feats.shape)
102
+ print("const:", const)
103
+ print("down_sampling_feats:", down_sampling_feats.shape)
104
+ exit()
105
+ if len(down_sampling_feats) < target_len:
106
+ # (1, dim) -> (err, dim)
107
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
108
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
109
+
110
+ # (target_len, dim)
111
+ feats = down_sampling_feats[:target_len]
112
+ mert_features.append(feats)
113
+
114
+ return mert_features
115
+
116
+
117
+ def load_mert_model(hps):
118
+ print("Loading MERT Model: ", hps.mert_model)
119
+
120
+ # Load model
121
+ model_name = hps.mert_model
122
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
123
+
124
+ if torch.cuda.is_available():
125
+ model = model.cuda()
126
+
127
+ # model = model.eval()
128
+
129
+ preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(
130
+ model_name, trust_remote_code=True
131
+ )
132
+ return model, preprocessor
133
+
134
+
135
+ # loading the corresponding preprocessor config
136
+ # def load_preprocessor (model_name="m-a-p/MERT-v1-330M"):
137
+ # print('load_preprocessor...')
138
+ # preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(model_name,trust_remote_code=True)
139
+ # return preprocessor
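+
+
+ # Usage sketch (added comment; `hps` is assumed to provide `mert_model`, e.g.
+ # "m-a-p/MERT-v1-330M", and `mert_feature_layer`):
+ #
+ #   model, processor = load_mert_model(hps)
+ #   feature = mert_encoder(model, processor, "/path/to/audio.wav", hps)  # (frame_len, 1024)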
utils/model_summary.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import humanfriendly
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def get_human_readable_count(number: int) -> str:
12
+ """Return human_readable_count
13
+
14
+ Originated from:
15
+ https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/core/memory.py
16
+
17
+ Abbreviates an integer number with K, M, B, T for thousands, millions,
18
+ billions and trillions, respectively.
19
+ Examples:
20
+ >>> get_human_readable_count(123)
21
+ '123 '
22
+ >>> get_human_readable_count(1234) # (one thousand)
23
+ '1 K'
24
+ >>> get_human_readable_count(2e6) # (two million)
25
+ '2 M'
26
+ >>> get_human_readable_count(3e9) # (three billion)
27
+ '3 B'
28
+ >>> get_human_readable_count(4e12) # (four trillion)
29
+ '4 T'
30
+ >>> get_human_readable_count(5e15) # (more than trillion)
31
+ '5,000 T'
32
+ Args:
33
+ number: a positive integer number
34
+ Return:
35
+ A string formatted according to the pattern described above.
36
+ """
37
+ assert number >= 0
38
+ labels = [" ", "K", "M", "B", "T"]
39
+ num_digits = int(np.floor(np.log10(number)) + 1 if number > 0 else 1)
40
+ num_groups = int(np.ceil(num_digits / 3))
41
+ num_groups = min(num_groups, len(labels))
42
+ shift = -3 * (num_groups - 1)
43
+ number = number * (10**shift)
44
+ index = num_groups - 1
45
+ return f"{number:.2f} {labels[index]}"
46
+
47
+
48
+ def to_bytes(dtype) -> int:
49
+ return int(str(dtype)[-2:]) // 8
50
+
51
+
52
+ def model_summary(model: torch.nn.Module) -> str:
53
+ message = "Model structure:\n"
54
+ message += str(model)
55
+ tot_params = sum(p.numel() for p in model.parameters())
56
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
57
+ percent_trainable = "{:.1f}".format(num_params * 100.0 / tot_params)
58
+ tot_params = get_human_readable_count(tot_params)
59
+ num_params = get_human_readable_count(num_params)
60
+ message += "\n\nModel summary:\n"
61
+ message += f" Class Name: {model.__class__.__name__}\n"
62
+ message += f" Total Number of model parameters: {tot_params}\n"
63
+ message += (
64
+ f" Number of trainable parameters: {num_params} ({percent_trainable}%)\n"
65
+ )
66
+ num_bytes = humanfriendly.format_size(
67
+ sum(
68
+ p.numel() * to_bytes(p.dtype) for p in model.parameters() if p.requires_grad
69
+ )
70
+ )
71
+ message += f" Size: {num_bytes}\n"
72
+ dtype = next(iter(model.parameters())).dtype
73
+ message += f" Type: {dtype}"
74
+ return message
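+
+
+ # Usage sketch (added comment; the module below is hypothetical):
+ #
+ #   model = torch.nn.Linear(128, 64)
+ #   print(model_summary(model))
+ #   # prints the structure, "Total Number of model parameters: 8.26 K", size and dtype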
utils/prompt_preparer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ class PromptPreparer:
9
+ def prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes):
10
+ if self.prefix_mode == 0:
11
+ y_emb, prefix_len = self._handle_prefix_mode_0(y, codes, nar_stage)
12
+ elif self.prefix_mode == 1:
13
+ y_emb, prefix_len = self._handle_prefix_mode_1(y, y_lens, codes, nar_stage)
14
+ elif self.prefix_mode in [2, 4]:
15
+ y_emb, prefix_len = self._handle_prefix_mode_2_4(y, y_lens, codes, nar_stage, y_prompts_codes)
16
+ else:
17
+ raise ValueError("Invalid prefix mode")
18
+
19
+ return y_emb, prefix_len
20
+
21
+ def _handle_prefix_mode_0(self, y, codes, nar_stage):
22
+ prefix_len = 0
23
+ y_emb = self.nar_audio_embeddings[0](y)
24
+ for j in range(1, nar_stage):
25
+ y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
26
+ return y_emb, 0
27
+
28
+ def _handle_prefix_mode_1(self, y, y_lens, codes, nar_stage):
29
+ int_low = (0.25 * y_lens.min()).type(torch.int64).item()
30
+ prefix_len = torch.randint(int_low, int_low * 2, size=()).item()
31
+ prefix_len = min(prefix_len, 225)
32
+
33
+ y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
34
+ y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
35
+ for j in range(1, self.num_quantizers):
36
+ y_prompts += self.nar_audio_embeddings[j](
37
+ codes[:, :prefix_len, j]
38
+ )
39
+ if j < nar_stage:
40
+ y_emb += self.nar_audio_embeddings[j](
41
+ codes[:, prefix_len:, j]
42
+ )
43
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
44
+ return y_emb, prefix_len
45
+
46
+ def _handle_prefix_mode_2_4(self, y, y_lens, codes, nar_stage, y_prompts_codes):
47
+ if self.prefix_mode == 2:
48
+ prefix_len = min(225, int(0.25 * y_lens.min().item()))
49
+
50
+ y_prompts_codes = []
51
+ for b in range(codes.shape[0]):
52
+ start = self.rng.randint(0, y_lens[b].item() - prefix_len)
53
+ y_prompts_codes.append(
54
+ torch.clone(codes[b, start : start + prefix_len])
55
+ )
56
+ codes[
57
+ b, start : start + prefix_len, nar_stage
58
+ ] = self.audio_token_num
59
+ y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
60
+ else:
61
+ prefix_len = y_prompts_codes.shape[1]
62
+
63
+ y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
64
+ y_emb = self.nar_audio_embeddings[0](y)
65
+ for j in range(1, self.num_quantizers):
66
+ y_prompts += self.nar_audio_embeddings[j](
67
+ y_prompts_codes[..., j]
68
+ )
69
+ if j < nar_stage:
70
+ y_emb += self.nar_audio_embeddings[j](codes[..., j])
71
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
72
+
73
+ return y_emb, prefix_len
utils/ssim.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/Po-Hsun-Su/pytorch-ssim
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch.autograd import Variable
11
+ from math import exp
12
+
13
+
14
+ def gaussian(window_size, sigma):
15
+ gauss = torch.Tensor(
16
+ [
17
+ exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2))
18
+ for x in range(window_size)
19
+ ]
20
+ )
21
+ return gauss / gauss.sum()
22
+
23
+
24
+ def create_window(window_size, channel):
25
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
26
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
27
+ window = Variable(
28
+ _2D_window.expand(channel, 1, window_size, window_size).contiguous()
29
+ )
30
+ return window
31
+
32
+
33
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
34
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
35
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
36
+
37
+ mu1_sq = mu1.pow(2)
38
+ mu2_sq = mu2.pow(2)
39
+ mu1_mu2 = mu1 * mu2
40
+
41
+ sigma1_sq = (
42
+ F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
43
+ )
44
+ sigma2_sq = (
45
+ F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
46
+ )
47
+ sigma12 = (
48
+ F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
49
+ - mu1_mu2
50
+ )
51
+
52
+ C1 = 0.01**2
53
+ C2 = 0.03**2
54
+
55
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
56
+ (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
57
+ )
58
+
59
+ if size_average:
60
+ return ssim_map.mean()
61
+ else:
62
+ return ssim_map.mean(1)
63
+
64
+
65
+ class SSIM(torch.nn.Module):
66
+ def __init__(self, window_size=11, size_average=True):
67
+ super(SSIM, self).__init__()
68
+ self.window_size = window_size
69
+ self.size_average = size_average
70
+ self.channel = 1
71
+ self.window = create_window(window_size, self.channel)
72
+
73
+ def forward(self, fake, real, bias=6.0):
74
+ fake = fake[:, None, :, :] + bias # [B, 1, T, n_mels]
75
+ real = real[:, None, :, :] + bias # [B, 1, T, n_mels]
76
+ self.window = self.window.to(dtype=fake.dtype, device=fake.device)
77
+ loss = 1 - _ssim(
78
+ fake, real, self.window, self.window_size, self.channel, self.size_average
79
+ )
80
+ return loss
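+
+
+ # Usage sketch (added comment; shapes and tensors are hypothetical): inputs are mel-like
+ # batches of shape [B, T, n_mels], and the module returns 1 - SSIM as a loss:
+ #
+ #   criterion = SSIM()
+ #   fake = torch.rand(4, 200, 80)   # predicted mel
+ #   real = torch.rand(4, 200, 80)   # ground-truth mel
+ #   loss = criterion(fake, real)    # scalar tensor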
utils/stft.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from scipy.signal import get_window
10
+ from librosa.util import pad_center, tiny
11
+ from librosa.filters import mel as librosa_mel_fn
12
+
13
+ import torch
14
+ import numpy as np
15
+ import librosa.util as librosa_util
16
+ from scipy.signal import get_window
17
+
18
+
19
+ def window_sumsquare(
20
+ window,
21
+ n_frames,
22
+ hop_length,
23
+ win_length,
24
+ n_fft,
25
+ dtype=np.float32,
26
+ norm=None,
27
+ ):
28
+ """
29
+ # from librosa 0.6
30
+ Compute the sum-square envelope of a window function at a given hop length.
31
+
32
+ This is used to estimate modulation effects induced by windowing
33
+ observations in short-time fourier transforms.
34
+
35
+ Parameters
36
+ ----------
37
+ window : string, tuple, number, callable, or list-like
38
+ Window specification, as in `get_window`
39
+
40
+ n_frames : int > 0
41
+ The number of analysis frames
42
+
43
+ hop_length : int > 0
44
+ The number of samples to advance between frames
45
+
46
+ win_length : [optional]
47
+ The length of the window function. By default, this matches `n_fft`.
48
+
49
+ n_fft : int > 0
50
+ The length of each analysis frame.
51
+
52
+ dtype : np.dtype
53
+ The data type of the output
54
+
55
+ Returns
56
+ -------
57
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
58
+ The sum-squared envelope of the window function
59
+ """
60
+ if win_length is None:
61
+ win_length = n_fft
62
+
63
+ n = n_fft + hop_length * (n_frames - 1)
64
+ x = np.zeros(n, dtype=dtype)
65
+
66
+ # Compute the squared window at the desired length
67
+ win_sq = get_window(window, win_length, fftbins=True)
68
+ win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
69
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
70
+
71
+ # Fill the envelope
72
+ for i in range(n_frames):
73
+ sample = i * hop_length
74
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
75
+ return x
76
+
77
+
78
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
79
+ """
80
+ PARAMS
81
+ ------
82
+ magnitudes: spectrogram magnitudes
83
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
84
+ """
85
+
86
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
87
+ angles = angles.astype(np.float32)
88
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
89
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
90
+
91
+ for i in range(n_iters):
92
+ _, angles = stft_fn.transform(signal)
93
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
94
+ return signal
95
+
96
+
97
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
98
+ """
99
+ PARAMS
100
+ ------
101
+ C: compression factor
102
+ """
103
+ return torch.log(torch.clamp(x, min=clip_val) * C)
104
+
105
+
106
+ def dynamic_range_decompression(x, C=1):
107
+ """
108
+ PARAMS
109
+ ------
110
+ C: compression factor used to compress
111
+ """
112
+ return torch.exp(x) / C
113
+
114
+
115
+ class STFT(torch.nn.Module):
116
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
117
+
118
+ def __init__(self, filter_length, hop_length, win_length, window="hann"):
119
+ super(STFT, self).__init__()
120
+ self.filter_length = filter_length
121
+ self.hop_length = hop_length
122
+ self.win_length = win_length
123
+ self.window = window
124
+ self.forward_transform = None
125
+ scale = self.filter_length / self.hop_length
126
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
127
+
128
+ cutoff = int((self.filter_length / 2 + 1))
129
+ fourier_basis = np.vstack(
130
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
131
+ )
132
+
133
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
134
+ inverse_basis = torch.FloatTensor(
135
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :]
136
+ )
137
+
138
+ if window is not None:
139
+ assert filter_length >= win_length
140
+ # get window and zero center pad it to filter_length
141
+ fft_window = get_window(window, win_length, fftbins=True)
142
+ fft_window = pad_center(fft_window, filter_length)
143
+ fft_window = torch.from_numpy(fft_window).float()
144
+
145
+ # window the bases
146
+ forward_basis *= fft_window
147
+ inverse_basis *= fft_window
148
+
149
+ self.register_buffer("forward_basis", forward_basis.float())
150
+ self.register_buffer("inverse_basis", inverse_basis.float())
151
+
152
+ def transform(self, input_data):
153
+ num_batches = input_data.size(0)
154
+ num_samples = input_data.size(1)
155
+
156
+ self.num_samples = num_samples
157
+
158
+ # similar to librosa, reflect-pad the input
159
+ input_data = input_data.view(num_batches, 1, num_samples)
160
+ input_data = F.pad(
161
+ input_data.unsqueeze(1),
162
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
163
+ mode="reflect",
164
+ )
165
+ input_data = input_data.squeeze(1)
166
+
167
+ forward_transform = F.conv1d(
168
+ input_data.cuda(),
169
+ torch.autograd.Variable(self.forward_basis, requires_grad=False).cuda(),
170
+ stride=self.hop_length,
171
+ padding=0,
172
+ ).cpu()
173
+
174
+ cutoff = int((self.filter_length / 2) + 1)
175
+ real_part = forward_transform[:, :cutoff, :]
176
+ imag_part = forward_transform[:, cutoff:, :]
177
+
178
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
179
+ phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
180
+
181
+ return magnitude, phase
182
+
183
+ def inverse(self, magnitude, phase):
184
+ recombine_magnitude_phase = torch.cat(
185
+ [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
186
+ )
187
+
188
+ inverse_transform = F.conv_transpose1d(
189
+ recombine_magnitude_phase,
190
+ torch.autograd.Variable(self.inverse_basis, requires_grad=False),
191
+ stride=self.hop_length,
192
+ padding=0,
193
+ )
194
+
195
+ if self.window is not None:
196
+ window_sum = window_sumsquare(
197
+ self.window,
198
+ magnitude.size(-1),
199
+ hop_length=self.hop_length,
200
+ win_length=self.win_length,
201
+ n_fft=self.filter_length,
202
+ dtype=np.float32,
203
+ )
204
+ # remove modulation effects
205
+ approx_nonzero_indices = torch.from_numpy(
206
+ np.where(window_sum > tiny(window_sum))[0]
207
+ )
208
+ window_sum = torch.autograd.Variable(
209
+ torch.from_numpy(window_sum), requires_grad=False
210
+ )
211
+ window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
212
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
213
+ approx_nonzero_indices
214
+ ]
215
+
216
+ # scale by hop ratio
217
+ inverse_transform *= float(self.filter_length) / self.hop_length
218
+
219
+ inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
220
+ inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
221
+
222
+ return inverse_transform
223
+
224
+ def forward(self, input_data):
225
+ self.magnitude, self.phase = self.transform(input_data)
226
+ reconstruction = self.inverse(self.magnitude, self.phase)
227
+ return reconstruction
228
+
229
+
230
+ class TacotronSTFT(torch.nn.Module):
231
+ def __init__(
232
+ self,
233
+ filter_length,
234
+ hop_length,
235
+ win_length,
236
+ n_mel_channels,
237
+ sampling_rate,
238
+ mel_fmin,
239
+ mel_fmax,
240
+ ):
241
+ super(TacotronSTFT, self).__init__()
242
+ self.n_mel_channels = n_mel_channels
243
+ self.sampling_rate = sampling_rate
244
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
245
+ mel_basis = librosa_mel_fn(
246
+ sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
247
+ )
248
+ mel_basis = torch.from_numpy(mel_basis).float()
249
+ self.register_buffer("mel_basis", mel_basis)
250
+
251
+ def spectral_normalize(self, magnitudes):
252
+ output = dynamic_range_compression(magnitudes)
253
+ return output
254
+
255
+ def spectral_de_normalize(self, magnitudes):
256
+ output = dynamic_range_decompression(magnitudes)
257
+ return output
258
+
259
+ def mel_spectrogram(self, y):
260
+ """Computes mel-spectrograms from a batch of waves
261
+ PARAMS
262
+ ------
263
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
264
+
265
+ RETURNS
266
+ -------
267
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
268
+ """
269
+ assert torch.min(y.data) >= -1
270
+ assert torch.max(y.data) <= 1
271
+
272
+ magnitudes, phases = self.stft_fn.transform(y)
273
+ magnitudes = magnitudes.data
274
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
275
+ mel_output = self.spectral_normalize(mel_output)
276
+ energy = torch.norm(magnitudes, dim=1)
277
+
278
+ return mel_output, energy
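+
+
+ # Usage sketch (added comment; the values mirror common Tacotron 2 settings and are not the
+ # project defaults). Note that STFT.transform above moves data to CUDA, so a GPU is required
+ # as written:
+ #
+ #   stft = TacotronSTFT(
+ #       filter_length=1024, hop_length=256, win_length=1024,
+ #       n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0,
+ #   )
+ #   wav = torch.rand(1, 22050) * 2 - 1        # one second of audio in [-1, 1]
+ #   mel, energy = stft.mel_spectrogram(wav)   # (1, 80, frames), (1, frames)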
utils/symbol_table.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from
7
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/utils/symbol_table.py
8
+
9
+ from dataclasses import dataclass
10
+ from dataclasses import field
11
+ from typing import Dict
12
+ from typing import Generic
13
+ from typing import List
14
+ from typing import Optional
15
+ from typing import TypeVar
16
+ from typing import Union
17
+
18
+ Symbol = TypeVar('Symbol')
19
+
20
+
21
+ @dataclass(repr=False)
22
+ class SymbolTable(Generic[Symbol]):
23
+ '''SymbolTable that maps symbol IDs, found on the FSA arcs to
24
+ actual objects. These objects can be arbitrary Python objects
25
+ that can serve as keys in a dictionary (i.e. they need to be
26
+ hashable and immutable).
27
+
28
+ The SymbolTable can only be written to/read from disk if the
29
+ symbols are strings.
30
+ '''
31
+ _id2sym: Dict[int, Symbol] = field(default_factory=dict)
32
+ '''Map an integer to a symbol.
33
+ '''
34
+
35
+ _sym2id: Dict[Symbol, int] = field(default_factory=dict)
36
+ '''Map a symbol to an integer.
37
+ '''
38
+
39
+ _next_available_id: int = 1
40
+ '''A helper internal field that helps adding new symbols
41
+ to the table efficiently.
42
+ '''
43
+
44
+ eps: Symbol = '<eps>'
45
+ '''Null symbol, always mapped to index 0.
46
+ '''
47
+
48
+ def __post_init__(self):
49
+ assert all(self._sym2id[sym] == idx for idx, sym in self._id2sym.items())
50
+ assert all(self._id2sym[idx] == sym for sym, idx in self._sym2id.items())
51
+ assert 0 not in self._id2sym or self._id2sym[0] == self.eps
52
+
53
+ self._next_available_id = max(self._id2sym, default=0) + 1
54
+ self._id2sym.setdefault(0, self.eps)
55
+ self._sym2id.setdefault(self.eps, 0)
56
+
57
+
58
+ @staticmethod
59
+ def from_str(s: str) -> 'SymbolTable':
60
+ '''Build a symbol table from a string.
61
+
62
+ The string consists of lines. Every line has two fields separated
63
+ by space(s), tab(s) or both. The first field is the symbol and the
64
+ second the integer id of the symbol.
65
+
66
+ Args:
67
+ s:
68
+ The input string with the format described above.
69
+ Returns:
70
+ An instance of :class:`SymbolTable`.
71
+ '''
72
+ id2sym: Dict[int, str] = dict()
73
+ sym2id: Dict[str, int] = dict()
74
+
75
+ for line in s.split('\n'):
76
+ fields = line.split()
77
+ if len(fields) == 0:
78
+ continue # skip empty lines
79
+ assert len(fields) == 2, \
80
+ f'Expect a line with 2 fields. Given: {len(fields)}'
81
+ sym, idx = fields[0], int(fields[1])
82
+ assert sym not in sym2id, f'Duplicated symbol {sym}'
83
+ assert idx not in id2sym, f'Duplicated id {idx}'
84
+ id2sym[idx] = sym
85
+ sym2id[sym] = idx
86
+
87
+ eps = id2sym.get(0, '<eps>')
88
+
89
+ return SymbolTable(_id2sym=id2sym, _sym2id=sym2id, eps=eps)
90
+
91
+ @staticmethod
92
+ def from_file(filename: str) -> 'SymbolTable':
93
+ '''Build a symbol table from file.
94
+
95
+ Every line in the symbol table file has two fields separated by
96
+ space(s), tab(s) or both. The following is an example file:
97
+
98
+ .. code-block::
99
+
100
+ <eps> 0
101
+ a 1
102
+ b 2
103
+ c 3
104
+
105
+ Args:
106
+ filename:
107
+ Name of the symbol table file. Its format is documented above.
108
+
109
+ Returns:
110
+ An instance of :class:`SymbolTable`.
111
+
112
+ '''
113
+ with open(filename, 'r', encoding='utf-8') as f:
114
+ return SymbolTable.from_str(f.read().strip())
115
+
116
+ def to_str(self) -> str:
117
+ '''
118
+ Returns:
119
+ Return a string representation of this object. You can pass
120
+ it to the method ``from_str`` to recreate an identical object.
121
+ '''
122
+ s = ''
123
+ for idx, symbol in sorted(self._id2sym.items()):
124
+ s += f'{symbol} {idx}\n'
125
+ return s
126
+
127
+ def to_file(self, filename: str):
128
+ '''Serialize the SymbolTable to a file.
129
+
130
+ Every line in the symbol table file has two fields separated by
131
+ space(s), tab(s) or both. The following is an example file:
132
+
133
+ .. code-block::
134
+
135
+ <eps> 0
136
+ a 1
137
+ b 2
138
+ c 3
139
+
140
+ Args:
141
+ filename:
142
+ Name of the symbol table file. Its format is documented above.
143
+ '''
144
+ with open(filename, 'w') as f:
145
+ for idx, symbol in sorted(self._id2sym.items()):
146
+ print(symbol, idx, file=f)
147
+
148
+ def add(self, symbol: Symbol, index: Optional[int] = None) -> int:
149
+ '''Add a new symbol to the SymbolTable.
150
+
151
+ Args:
152
+ symbol:
153
+ The symbol to be added.
154
+ index:
155
+ Optional int id to which the symbol should be assigned.
156
+ If it is not available, a ValueError will be raised.
157
+
158
+ Returns:
159
+ The int id to which the symbol has been assigned.
160
+ '''
161
+ # Already in the table? Return its ID.
162
+ if symbol in self._sym2id:
163
+ return self._sym2id[symbol]
164
+ # Specific ID not provided - use next available.
165
+ if index is None:
166
+ index = self._next_available_id
167
+ # Specific ID provided but not available.
168
+ if index in self._id2sym:
169
+ raise ValueError(f"Cannot assign id '{index}' to '{symbol}' - "
170
+ f"already occupied by {self._id2sym[index]}")
171
+ self._sym2id[symbol] = index
172
+ self._id2sym[index] = symbol
173
+
174
+ # Update next available ID if needed
175
+ if self._next_available_id <= index:
176
+ self._next_available_id = index + 1
177
+
178
+ return index
179
+
180
+ def get(self, k: Union[int, Symbol]) -> Union[Symbol, int]:
181
+ '''Get a symbol for an id or get an id for a symbol
182
+
183
+ Args:
184
+ k:
185
+ If it is an id, it tries to find the symbol corresponding
186
+ to the id; if it is a symbol, it tries to find the id
187
+ corresponding to the symbol.
188
+
189
+ Returns:
190
+ An id or a symbol depending on the given `k`.
191
+ '''
192
+ if isinstance(k, int):
193
+ return self._id2sym[k]
194
+ else:
195
+ return self._sym2id[k]
196
+
197
+ def merge(self, other: 'SymbolTable') -> 'SymbolTable':
198
+ '''Create a union of two SymbolTables.
199
+ Raises an AssertionError if the same IDs are occupied by
200
+ different symbols.
201
+
202
+ Args:
203
+ other:
204
+ A symbol table to merge with ``self``.
205
+
206
+ Returns:
207
+ A new symbol table.
208
+ '''
209
+ self._check_compatible(other)
210
+ return SymbolTable(
211
+ _id2sym={**self._id2sym, **other._id2sym},
212
+ _sym2id={**self._sym2id, **other._sym2id},
213
+ eps=self.eps
214
+ )
215
+
216
+ def _check_compatible(self, other: 'SymbolTable') -> None:
217
+ # Epsilon compatibility
218
+ assert self.eps == other.eps, f'Mismatched epsilon symbol: ' \
219
+ f'{self.eps} != {other.eps}'
220
+ # IDs compatibility
221
+ common_ids = set(self._id2sym).intersection(other._id2sym)
222
+ for idx in common_ids:
223
+ assert self[idx] == other[idx], f'ID conflict for id: {idx}, ' \
224
+ f'self[idx] = "{self[idx]}", ' \
225
+ f'other[idx] = "{other[idx]}"'
226
+ # Symbols compatibility
227
+ common_symbols = set(self._sym2id).intersection(other._sym2id)
228
+ for sym in common_symbols:
229
+ assert self[sym] == other[sym], f'ID conflict for symbol: {sym}, ' \
230
+ f'self[sym] = "{self[sym]}", ' \
231
+ f'other[sym] = "{other[sym]}"'
232
+
233
+ def __getitem__(self, item: Union[int, Symbol]) -> Union[Symbol, int]:
234
+ return self.get(item)
235
+
236
+ def __contains__(self, item: Union[int, Symbol]) -> bool:
237
+ if isinstance(item, int):
238
+ return item in self._id2sym
239
+ else:
240
+ return item in self._sym2id
241
+
242
+ def __len__(self) -> int:
243
+ return len(self._id2sym)
244
+
245
+ def __eq__(self, other: 'SymbolTable') -> bool:
246
+ if len(self) != len(other):
247
+ return False
248
+
249
+ for s in self.symbols:
250
+ if self[s] != other[s]:
251
+ return False
252
+
253
+ return True
254
+
255
+ @property
256
+ def ids(self) -> List[int]:
257
+ '''Returns a list of integer IDs corresponding to the symbols.
258
+ '''
259
+ ans = list(self._id2sym.keys())
260
+ ans.sort()
261
+ return ans
262
+
263
+ @property
264
+ def symbols(self) -> List[Symbol]:
265
+ '''Returns a list of symbols (e.g., strings) corresponding to
266
+ the integer IDs.
267
+ '''
268
+ ans = list(self._sym2id.keys())
269
+ ans.sort()
270
+ return ans
271
+
272
+
273
+ class TextToken:
274
+ def __init__(
275
+ self,
276
+ text_tokens: List[str],
277
+ add_eos: bool = True,
278
+ add_bos: bool = True,
279
+ pad_symbol: str = "<pad>",
280
+ bos_symbol: str = "<bos>",
281
+ eos_symbol: str = "<eos>",
282
+ ):
283
+ self.pad_symbol = pad_symbol
284
+ self.add_eos = add_eos
285
+ self.add_bos = add_bos
286
+ self.bos_symbol = bos_symbol
287
+ self.eos_symbol = eos_symbol
288
+
289
+ unique_tokens = [pad_symbol]
290
+ if add_bos:
291
+ unique_tokens.append(bos_symbol)
292
+ if add_eos:
293
+ unique_tokens.append(eos_symbol)
294
+ unique_tokens.extend(sorted(text_tokens))
295
+
296
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
297
+ self.idx2token = unique_tokens
298
+
299
+
300
+ def get_token_id_seq(self, text):
301
+ tokens_seq = [p for p in text]
302
+ seq = (
303
+ ([self.bos_symbol] if self.add_bos else [])
304
+ + tokens_seq
305
+ + ([self.eos_symbol] if self.add_eos else [])
306
+ )
307
+
308
+ token_ids = [self.token2idx[token] for token in seq]
309
+ token_lens = len(tokens_seq) + self.add_eos + self.add_bos
310
+
311
+ return token_ids, token_lens
312
+
313
+
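A small usage sketch of the two classes above (the symbols are made-up examples):

# Build a table from a string and query it in both directions.
table = SymbolTable.from_str("<eps> 0\na 1\nb 2")
assert table["a"] == 1 and table[2] == "b"
new_id = table.add("c")  # assigned the next free id, here 3

# Character-level text tokens with <bos>/<eos> handling.
tokenizer = TextToken(["a", "b", "c"])
token_ids, token_len = tokenizer.get_token_id_seq("abc")
# token_len == len("abc") + 2 because both <bos> and <eos> are added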
utils/tokenizer.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from
7
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
8
+
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Pattern, Union
11
+
12
+ import torch
13
+ import torchaudio
14
+ from encodec import EncodecModel
15
+ from encodec.utils import convert_audio
16
+
17
+
18
+
19
+ class AudioTokenizer:
20
+ """EnCodec audio tokenizer for encoding and decoding audio.
21
+
22
+ Attributes:
23
+ device: The device on which the codec model is loaded.
24
+ codec: The pretrained EnCodec model.
25
+ sample_rate: Sample rate of the model.
26
+ channels: Number of audio channels in the model.
27
+ """
28
+
29
+ def __init__(self, device: Any = None) -> None:
30
+ model = EncodecModel.encodec_model_24khz()
31
+ model.set_target_bandwidth(6.0)
32
+ remove_encodec_weight_norm(model)
33
+
34
+ if not device:
35
+ device = torch.device("cpu")
36
+ if torch.cuda.is_available():
37
+ device = torch.device("cuda:0")
38
+
39
+ self._device = device
40
+
41
+ self.codec = model.to(device)
42
+ self.sample_rate = model.sample_rate
43
+ self.channels = model.channels
44
+
45
+ @property
46
+ def device(self):
47
+ return self._device
48
+
49
+ def encode(self, wav: torch.Tensor) -> torch.Tensor:
50
+ """Encode the audio waveform.
51
+
52
+ Args:
53
+ wav: A tensor representing the audio waveform.
54
+
55
+ Returns:
56
+ A tensor representing the encoded audio.
57
+ """
58
+ return self.codec.encode(wav.to(self.device))
59
+
60
+ def decode(self, frames: torch.Tensor) -> torch.Tensor:
61
+ """Decode the encoded audio frames.
62
+
63
+ Args:
64
+ frames: A tensor representing the encoded audio frames.
65
+
66
+ Returns:
67
+ A tensor representing the decoded audio waveform.
68
+ """
69
+ return self.codec.decode(frames)
70
+
71
+
72
+
73
+ def tokenize_audio(tokenizer: AudioTokenizer, audio_path: str):
74
+ """
75
+ Tokenize the audio waveform using the given AudioTokenizer.
76
+
77
+ Args:
78
+ tokenizer: An instance of AudioTokenizer.
79
+ audio_path: Path to the audio file.
80
+
81
+ Returns:
82
+ A tensor of encoded frames from the audio.
83
+
84
+ Raises:
85
+ FileNotFoundError: If the audio file is not found.
86
+ RuntimeError: If there's an error processing the audio data.
87
+ """
88
+ # try:
89
+ # Load and preprocess the audio waveform
90
+ wav, sr = torchaudio.load(audio_path)
91
+ wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
92
+ wav = wav.unsqueeze(0)
93
+
94
+ # Extract discrete codes from EnCodec
95
+ with torch.no_grad():
96
+ encoded_frames = tokenizer.encode(wav)
97
+ return encoded_frames
98
+
99
+ # except FileNotFoundError:
100
+ # raise FileNotFoundError(f"Audio file not found at {audio_path}")
101
+ # except Exception as e:
102
+ # raise RuntimeError(f"Error processing audio data: {e}")
103
+
104
+
105
+
106
+ def remove_encodec_weight_norm(model):
107
+ from encodec.modules import SConv1d
108
+ from encodec.modules.seanet import SConvTranspose1d, SEANetResnetBlock
109
+ from torch.nn.utils import remove_weight_norm
110
+
111
+ encoder = model.encoder.model
112
+ for key in encoder._modules:
113
+ if isinstance(encoder._modules[key], SEANetResnetBlock):
114
+ remove_weight_norm(encoder._modules[key].shortcut.conv.conv)
115
+ block_modules = encoder._modules[key].block._modules
116
+ for skey in block_modules:
117
+ if isinstance(block_modules[skey], SConv1d):
118
+ remove_weight_norm(block_modules[skey].conv.conv)
119
+ elif isinstance(encoder._modules[key], SConv1d):
120
+ remove_weight_norm(encoder._modules[key].conv.conv)
121
+
122
+ decoder = model.decoder.model
123
+ for key in decoder._modules:
124
+ if isinstance(decoder._modules[key], SEANetResnetBlock):
125
+ remove_weight_norm(decoder._modules[key].shortcut.conv.conv)
126
+ block_modules = decoder._modules[key].block._modules
127
+ for skey in block_modules:
128
+ if isinstance(block_modules[skey], SConv1d):
129
+ remove_weight_norm(block_modules[skey].conv.conv)
130
+ elif isinstance(decoder._modules[key], SConvTranspose1d):
131
+ remove_weight_norm(decoder._modules[key].convtr.convtr)
132
+ elif isinstance(decoder._modules[key], SConv1d):
133
+ remove_weight_norm(decoder._modules[key].conv.conv)
134
+
135
+
136
+ def extract_encodec_token(wav_path):
137
+ model = EncodecModel.encodec_model_24khz()
138
+ model.set_target_bandwidth(6.0)
139
+
140
+ wav, sr = torchaudio.load(wav_path)
141
+ wav = convert_audio(wav, sr, model.sample_rate, model.channels)
142
+ wav = wav.unsqueeze(0)
143
+ if torch.cuda.is_available():
144
+ model = model.cuda()
145
+ wav = wav.cuda()
146
+ with torch.no_grad():
147
+ encoded_frames = model.encode(wav)
148
+ codes_ = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # [B, n_q, T]
149
+ codes = codes_.cpu().numpy()[0,:,:].T # [T, 8]
150
+
151
+ return codes
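A minimal sketch of extracting EnCodec codes with the helpers above ("audio.wav" is a placeholder path):

# One-shot extraction: returns a numpy array of shape (T, 8).
codes = extract_encodec_token("audio.wav")

# Or keep the (weight-norm-stripped) model resident for repeated calls:
tokenizer = AudioTokenizer()
encoded_frames = tokenize_audio(tokenizer, "audio.wav")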
utils/topk_sampling.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+
11
+ # This function is modified from https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
12
+ def top_k_top_p_filtering(
13
+ logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
14
+ ):
15
+ """
16
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.
17
+
18
+ Args:
19
+ logits (torch.Tensor): Logits distribution with shape (batch size, vocabulary size).
20
+ top_k (int, optional): Keep only top k tokens with highest probability (top-k filtering).
21
+ Set to 0 to disable. Defaults to 0.
22
+ top_p (float, optional): Keep the top tokens with a cumulative probability >= top_p (nucleus filtering).
23
+ Must be between 0 and 1, inclusive. Defaults to 1.0.
24
+ filter_value (float, optional): The value to assign to filtered logits. Defaults to -float('Inf').
25
+ min_tokens_to_keep (int, optional): Ensure that at least this number of tokens are kept per batch example.
26
+ Defaults to 1.
27
+
28
+ Returns:
29
+ torch.Tensor: The filtered logits.
30
+ """
31
+ """
32
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
33
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
34
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
35
+ """
36
+ if top_k > 0:
37
+ # Apply top-k filtering
38
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
39
+ indices_to_remove = logits < torch.topk(logits, top_k).values[..., -1, None]
40
+ logits[indices_to_remove] = filter_value
41
+
42
+ if top_p < 1.0:
43
+ # Apply top-p filtering
44
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
45
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
46
+
47
+ # Create a mask to remove tokens with cumulative probability above the top_p threshold
48
+ sorted_indices_to_remove = cumulative_probs > top_p
49
+ if min_tokens_to_keep > 1:
50
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
51
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
52
+ sorted_indices_to_remove[..., 0] = 0
53
+
54
+ # Scatter sorted tensors back to original indexing
55
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
56
+ logits[indices_to_remove] = filter_value
57
+
58
+ return logits
59
+
60
+
61
+ def topk_sampling(logits, top_k=50, top_p=1.0, temperature=1.0):
62
+ """
63
+ Perform top-k and top-p sampling on logits.
64
+
65
+ Args:
66
+ logits (torch.Tensor): The logits to sample from.
67
+ top_k (int, optional): The number of highest probability tokens to keep for top-k filtering.
68
+ Must be a positive integer. Defaults to 50.
69
+ top_p (float, optional): The cumulative probability threshold for nucleus sampling.
70
+ Must be between 0 and 1. Defaults to 1.0.
71
+ temperature (float, optional): The scaling factor to adjust the logits distribution.
72
+ Must be strictly positive. Defaults to 1.0.
73
+
74
+ Returns:
75
+ torch.Tensor: The sampled token.
76
+ """
77
+
78
+ # Adjust logits using temperature
79
+ if temperature != 1.0:
80
+ logits = logits / temperature
81
+
82
+ # Top-p/top-k filtering
83
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
84
+
85
+ # Sample from the filtered distribution
86
+ token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
87
+ return token
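A small sketch of sampling one token per batch element (the vocabulary size of 1024 is arbitrary):

import torch

logits = torch.randn(2, 1024)  # (batch, vocab)
token = topk_sampling(logits, top_k=50, top_p=0.9, temperature=1.0)
print(token.shape)  # torch.Size([2, 1])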
utils/trainer_utils.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ def check_nan(logger, loss, y_pred, y_gt):
10
+ if torch.any(torch.isnan(loss)):
11
+ logger.info("out has nan: ", torch.any(torch.isnan(y_pred)))
12
+ logger.info("y_gt has nan: ", torch.any(torch.isnan(y_gt)))
13
+ logger.info("out: ", y_pred)
14
+ logger.info("y_gt: ", y_gt)
15
+ logger.info("loss = {:.4f}\n".format(loss.item()))
16
+ exit()
utils/util.py ADDED
@@ -0,0 +1,688 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import collections
8
+ import glob
9
+ import os
10
+ import random
11
+ import time
12
+ import argparse
13
+ from collections import OrderedDict
14
+
15
+ import json5
16
+ import numpy as np
18
+ from torch.nn import functional as F
19
+
20
+
21
+ try:
22
+ from ruamel.yaml import YAML as yaml
23
+ except ImportError:
24
+ from ruamel_yaml import YAML as yaml
25
+
26
+ import torch
27
+
28
+ from utils.hparam import HParams
29
+ import logging
30
+ from logging import handlers
31
+
32
+
33
+ def str2bool(v):
34
+ """Used in argparse.ArgumentParser.add_argument to indicate
35
+ that a type is a bool type and user can enter
36
+
37
+ - yes, true, t, y, 1, to represent True
38
+ - no, false, f, n, 0, to represent False
39
+
40
+ See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa
41
+ """
42
+ if isinstance(v, bool):
43
+ return v
44
+ if v.lower() in ("yes", "true", "t", "y", "1"):
45
+ return True
46
+ elif v.lower() in ("no", "false", "f", "n", "0"):
47
+ return False
48
+ else:
49
+ raise argparse.ArgumentTypeError("Boolean value expected.")
50
+
51
+
52
+ def find_checkpoint_of_mapper(mapper_ckpt_dir):
53
+ mapper_ckpts = glob.glob(os.path.join(mapper_ckpt_dir, "ckpts/*.pt"))
54
+
55
+ # Select the max steps
56
+ mapper_ckpts.sort()
57
+ mapper_weights_file = mapper_ckpts[-1]
58
+ return mapper_weights_file
59
+
60
+
61
+ def pad_f0_to_tensors(f0s, batched=None):
62
+ # Initialize
63
+ tensors = []
64
+
65
+ if batched == None:
66
+ # Get the max frame for padding
67
+ size = -1
68
+ for f0 in f0s:
69
+ size = max(size, f0.shape[-1])
70
+
71
+ tensor = torch.zeros(len(f0s), size)
72
+
73
+ for i, f0 in enumerate(f0s):
74
+ tensor[i, : f0.shape[-1]] = f0[:]
75
+
76
+ tensors.append(tensor)
77
+ else:
78
+ start = 0
79
+ while start + batched - 1 < len(f0s):
80
+ end = start + batched - 1
81
+
82
+ # Get the max frame for padding
83
+ size = -1
84
+ for i in range(start, end + 1):
85
+ size = max(size, f0s[i].shape[-1])
86
+
87
+ tensor = torch.zeros(batched, size)
88
+
89
+ for i in range(start, end + 1):
90
+ tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
91
+
92
+ tensors.append(tensor)
93
+
94
+ start = start + batched
95
+
96
+ if start != len(f0s):
97
+ end = len(f0s)
98
+
99
+ # Get the max frame for padding
100
+ size = -1
101
+ for i in range(start, end):
102
+ size = max(size, f0s[i].shape[-1])
103
+
104
+ tensor = torch.zeros(len(f0s) - start, size)
105
+
106
+ for i in range(start, end):
107
+ tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
108
+
109
+ tensors.append(tensor)
110
+
111
+ return tensors
112
+
113
+
114
+ def pad_mels_to_tensors(mels, batched=None):
115
+ """
116
+ Args:
117
+ mels: A list of mel-specs
118
+ Returns:
119
+ tensors: A list of tensors containing the batched mel-specs
120
+ mel_frames: A list of tensors containing the frames of the original mel-specs
121
+ """
122
+ # Initialize
123
+ tensors = []
124
+ mel_frames = []
125
+
126
+ # Split mel-specs into batches to avoid cuda memory exceed
127
+ if batched == None:
128
+ # Get the max frame for padding
129
+ size = -1
130
+ for mel in mels:
131
+ size = max(size, mel.shape[-1])
132
+
133
+ tensor = torch.zeros(len(mels), mels[0].shape[0], size)
134
+ mel_frame = torch.zeros(len(mels), dtype=torch.int32)
135
+
136
+ for i, mel in enumerate(mels):
137
+ tensor[i, :, : mel.shape[-1]] = mel[:]
138
+ mel_frame[i] = mel.shape[-1]
139
+
140
+ tensors.append(tensor)
141
+ mel_frames.append(mel_frame)
142
+ else:
143
+ start = 0
144
+ while start + batched - 1 < len(mels):
145
+ end = start + batched - 1
146
+
147
+ # Get the max frame for padding
148
+ size = -1
149
+ for i in range(start, end + 1):
150
+ size = max(size, mels[i].shape[-1])
151
+
152
+ tensor = torch.zeros(batched, mels[0].shape[0], size)
153
+ mel_frame = torch.zeros(batched, dtype=torch.int32)
154
+
155
+ for i in range(start, end + 1):
156
+ tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
157
+ mel_frame[i - start] = mels[i].shape[-1]
158
+
159
+ tensors.append(tensor)
160
+ mel_frames.append(mel_frame)
161
+
162
+ start = start + batched
163
+
164
+ if start != len(mels):
165
+ end = len(mels)
166
+
167
+ # Get the max frame for padding
168
+ size = -1
169
+ for i in range(start, end):
170
+ size = max(size, mels[i].shape[-1])
171
+
172
+ tensor = torch.zeros(len(mels) - start, mels[0].shape[0], size)
173
+ mel_frame = torch.zeros(len(mels) - start, dtype=torch.int32)
174
+
175
+ for i in range(start, end):
176
+ tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
177
+ mel_frame[i - start] = mels[i].shape[-1]
178
+
179
+ tensors.append(tensor)
180
+ mel_frames.append(mel_frame)
181
+
182
+ return tensors, mel_frames
183
+
184
+
185
+ def load_model_config(args):
186
+ """Load model configurations (in args.json under checkpoint directory)
187
+
188
+ Args:
189
+ args (ArgumentParser): arguments to run bins/preprocess.py
190
+
191
+ Returns:
192
+ dict: dictionary that stores model configurations
193
+ """
194
+ if args.checkpoint_dir is None:
195
+ assert args.checkpoint_file is not None
196
+ checkpoint_dir = os.path.split(args.checkpoint_file)[0]
197
+ else:
198
+ checkpoint_dir = args.checkpoint_dir
199
+ config_path = os.path.join(checkpoint_dir, "args.json")
200
+ print("config_path: ", config_path)
201
+
202
+ config = load_config(config_path)
203
+ return config
204
+
205
+
206
+ def remove_and_create(dir):
207
+ if os.path.exists(dir):
208
+ os.system("rm -r {}".format(dir))
209
+ os.makedirs(dir, exist_ok=True)
210
+
211
+
212
+ def has_existed(path, warning=False):
213
+ if not warning:
214
+ return os.path.exists(path)
215
+
216
+ if os.path.exists(path):
217
+ answer = input(
218
+ "The path {} has existed. \nInput 'y' (or hit Enter) to skip it, and input 'n' to re-write it [y/n]\n".format(
219
+ path
220
+ )
221
+ )
222
+ if not answer == "n":
223
+ return True
224
+
225
+ return False
226
+
227
+
228
+ def remove_older_ckpt(saved_model_name, checkpoint_dir, max_to_keep=5):
229
+ if os.path.exists(os.path.join(checkpoint_dir, "checkpoint")):
230
+ with open(os.path.join(checkpoint_dir, "checkpoint"), "r") as f:
231
+ ckpts = [x.strip() for x in f.readlines()]
232
+ else:
233
+ ckpts = []
234
+ ckpts.append(saved_model_name)
235
+ for item in ckpts[:-max_to_keep]:
236
+ if os.path.exists(os.path.join(checkpoint_dir, item)):
237
+ os.remove(os.path.join(checkpoint_dir, item))
238
+ with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as f:
239
+ for item in ckpts[-max_to_keep:]:
240
+ f.write("{}\n".format(item))
241
+
242
+
243
+ def set_all_random_seed(seed: int):
244
+ random.seed(seed)
245
+ np.random.seed(seed)
246
+ torch.random.manual_seed(seed)
247
+
248
+
249
+ def save_checkpoint(
250
+ args,
251
+ generator,
252
+ g_optimizer,
253
+ step,
254
+ discriminator=None,
255
+ d_optimizer=None,
256
+ max_to_keep=5,
257
+ ):
258
+ saved_model_name = "model.ckpt-{}.pt".format(step)
259
+ checkpoint_path = os.path.join(args.checkpoint_dir, saved_model_name)
260
+
261
+ if discriminator and d_optimizer:
262
+ torch.save(
263
+ {
264
+ "generator": generator.state_dict(),
265
+ "discriminator": discriminator.state_dict(),
266
+ "g_optimizer": g_optimizer.state_dict(),
267
+ "d_optimizer": d_optimizer.state_dict(),
268
+ "global_step": step,
269
+ },
270
+ checkpoint_path,
271
+ )
272
+ else:
273
+ torch.save(
274
+ {
275
+ "generator": generator.state_dict(),
276
+ "g_optimizer": g_optimizer.state_dict(),
277
+ "global_step": step,
278
+ },
279
+ checkpoint_path,
280
+ )
281
+
282
+ print("Saved checkpoint: {}".format(checkpoint_path))
283
+
284
+ if os.path.exists(os.path.join(args.checkpoint_dir, "checkpoint")):
285
+ with open(os.path.join(args.checkpoint_dir, "checkpoint"), "r") as f:
286
+ ckpts = [x.strip() for x in f.readlines()]
287
+ else:
288
+ ckpts = []
289
+ ckpts.append(saved_model_name)
290
+ for item in ckpts[:-max_to_keep]:
291
+ if os.path.exists(os.path.join(args.checkpoint_dir, item)):
292
+ os.remove(os.path.join(args.checkpoint_dir, item))
293
+ with open(os.path.join(args.checkpoint_dir, "checkpoint"), "w") as f:
294
+ for item in ckpts[-max_to_keep:]:
295
+ f.write("{}\n".format(item))
296
+
297
+
298
+ def attempt_to_restore(
299
+ generator, g_optimizer, checkpoint_dir, discriminator=None, d_optimizer=None
300
+ ):
301
+ checkpoint_list = os.path.join(checkpoint_dir, "checkpoint")
302
+ if os.path.exists(checkpoint_list):
303
+ checkpoint_filename = open(checkpoint_list).readlines()[-1].strip()
304
+ checkpoint_path = os.path.join(checkpoint_dir, "{}".format(checkpoint_filename))
305
+ print("Restore from {}".format(checkpoint_path))
306
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
307
+ if generator:
308
+ if not list(generator.state_dict().keys())[0].startswith("module."):
309
+ raw_dict = checkpoint["generator"]
310
+ clean_dict = OrderedDict()
311
+ for k, v in raw_dict.items():
312
+ if k.startswith("module."):
313
+ clean_dict[k[7:]] = v
314
+ else:
315
+ clean_dict[k] = v
316
+ generator.load_state_dict(clean_dict)
317
+ else:
318
+ generator.load_state_dict(checkpoint["generator"])
319
+ if g_optimizer:
320
+ g_optimizer.load_state_dict(checkpoint["g_optimizer"])
321
+ global_step = 100000
322
+ if discriminator and "discriminator" in checkpoint.keys():
323
+ discriminator.load_state_dict(checkpoint["discriminator"])
324
+ global_step = checkpoint["global_step"]
325
+ print("restore discriminator")
326
+ if d_optimizer and "d_optimizer" in checkpoint.keys():
327
+ d_optimizer.load_state_dict(checkpoint["d_optimizer"])
328
+ print("restore d_optimizer...")
329
+ else:
330
+ global_step = 0
331
+ return global_step
332
+
333
+
334
+ class ExponentialMovingAverage(object):
335
+ def __init__(self, decay):
336
+ self.decay = decay
337
+ self.shadow = {}
338
+
339
+ def register(self, name, val):
340
+ self.shadow[name] = val.clone()
341
+
342
+ def update(self, name, x):
343
+ assert name in self.shadow
344
+ update_delta = self.shadow[name] - x
345
+ self.shadow[name] -= (1.0 - self.decay) * update_delta
346
+
347
+
348
+ def apply_moving_average(model, ema):
349
+ for name, param in model.named_parameters():
350
+ if name in ema.shadow:
351
+ ema.update(name, param.data)
352
+
353
+
354
+ def register_model_to_ema(model, ema):
355
+ for name, param in model.named_parameters():
356
+ if param.requires_grad:
357
+ ema.register(name, param.data)
358
+
359
+
360
+ class YParams(HParams):
361
+ def __init__(self, yaml_file):
362
+ if not os.path.exists(yaml_file):
363
+ raise IOError("yaml file: {} is not existed".format(yaml_file))
364
+ super().__init__()
365
+ self.d = collections.OrderedDict()
366
+ with open(yaml_file) as fp:
367
+ for _, v in yaml().load(fp).items():
368
+ for k1, v1 in v.items():
369
+ try:
370
+ if self.get(k1):
371
+ self.set_hparam(k1, v1)
372
+ else:
373
+ self.add_hparam(k1, v1)
374
+ self.d[k1] = v1
375
+ except Exception:
376
+ import traceback
377
+
378
+ print(traceback.format_exc())
379
+
380
+ # @property
381
+ def get_elements(self):
382
+ return self.d.items()
383
+
384
+
385
+ def override_config(base_config, new_config):
386
+ """Update new configurations in the original dict with the new dict
387
+
388
+ Args:
389
+ base_config (dict): original dict to be overridden
390
+ new_config (dict): dict with new configurations
391
+
392
+ Returns:
393
+ dict: updated configuration dict
394
+ """
395
+ for k, v in new_config.items():
396
+ if type(v) == dict:
397
+ if k not in base_config.keys():
398
+ base_config[k] = {}
399
+ base_config[k] = override_config(base_config[k], v)
400
+ else:
401
+ base_config[k] = v
402
+ return base_config
403
+
404
+
405
+ def get_lowercase_keys_config(cfg):
406
+ """Change all keys in cfg to lower case
407
+
408
+ Args:
409
+ cfg (dict): dictionary that stores configurations
410
+
411
+ Returns:
412
+ dict: dictionary that stores configurations
413
+ """
414
+ updated_cfg = dict()
415
+ for k, v in cfg.items():
416
+ if type(v) == dict:
417
+ v = get_lowercase_keys_config(v)
418
+ updated_cfg[k.lower()] = v
419
+ return updated_cfg
420
+
421
+
422
+ def _load_config(config_fn, lowercase=False):
423
+ """Load configurations into a dictionary
424
+
425
+ Args:
426
+ config_fn (str): path to configuration file
427
+ lowercase (bool, optional): whether changing keys to lower case. Defaults to False.
428
+
429
+ Returns:
430
+ dict: dictionary that stores configurations
431
+ """
432
+ with open(config_fn, "r") as f:
433
+ data = f.read()
434
+ config_ = json5.loads(data)
435
+ if "base_config" in config_:
436
+ # load configurations from new path
437
+ p_config_path = os.path.join(os.getenv("WORK_DIR"), config_["base_config"])
438
+ p_config_ = _load_config(p_config_path)
439
+ config_ = override_config(p_config_, config_)
440
+ if lowercase:
441
+ # change keys in config_ to lower case
442
+ config_ = get_lowercase_keys_config(config_)
443
+ return config_
444
+
445
+
446
+ def load_config(config_fn, lowercase=False):
447
+ """Load configurations into a dictionary
448
+
449
+ Args:
450
+ config_fn (str): path to configuration file
451
+ lowercase (bool, optional): _description_. Defaults to False.
452
+
453
+ Returns:
454
+ JsonHParams: an object that stores configurations
455
+ """
456
+ config_ = _load_config(config_fn, lowercase=lowercase)
457
+ # create an JsonHParams object with configuration dict
458
+ cfg = JsonHParams(**config_)
459
+ return cfg
460
+
461
+
462
+ def save_config(save_path, cfg):
463
+ """Save configurations into a json file
464
+
465
+ Args:
466
+ save_path (str): path to save configurations
467
+ cfg (dict): dictionary that stores configurations
468
+ """
469
+ with open(save_path, "w") as f:
470
+ json5.dump(
471
+ cfg, f, ensure_ascii=False, indent=4, quote_keys=True, sort_keys=True
472
+ )
473
+
474
+
475
+ class JsonHParams:
476
+ def __init__(self, **kwargs):
477
+ for k, v in kwargs.items():
478
+ if type(v) == dict:
479
+ v = JsonHParams(**v)
480
+ self[k] = v
481
+
482
+ def keys(self):
483
+ return self.__dict__.keys()
484
+
485
+ def items(self):
486
+ return self.__dict__.items()
487
+
488
+ def values(self):
489
+ return self.__dict__.values()
490
+
491
+ def __len__(self):
492
+ return len(self.__dict__)
493
+
494
+ def __getitem__(self, key):
495
+ return getattr(self, key)
496
+
497
+ def __setitem__(self, key, value):
498
+ return setattr(self, key, value)
499
+
500
+ def __contains__(self, key):
501
+ return key in self.__dict__
502
+
503
+ def __repr__(self):
504
+ return self.__dict__.__repr__()
505
+
506
+
507
+ class ValueWindow:
508
+ def __init__(self, window_size=100):
509
+ self._window_size = window_size
510
+ self._values = []
511
+
512
+ def append(self, x):
513
+ self._values = self._values[-(self._window_size - 1) :] + [x]
514
+
515
+ @property
516
+ def sum(self):
517
+ return sum(self._values)
518
+
519
+ @property
520
+ def count(self):
521
+ return len(self._values)
522
+
523
+ @property
524
+ def average(self):
525
+ return self.sum / max(1, self.count)
526
+
527
+ def reset(self):
528
+ self._values = []
529
+
530
+
531
+ class Logger(object):
532
+ def __init__(
533
+ self,
534
+ filename,
535
+ level="info",
536
+ when="D",
537
+ backCount=10,
538
+ fmt="%(asctime)s : %(message)s",
539
+ ):
540
+ self.level_relations = {
541
+ "debug": logging.DEBUG,
542
+ "info": logging.INFO,
543
+ "warning": logging.WARNING,
544
+ "error": logging.ERROR,
545
+ "crit": logging.CRITICAL,
546
+ }
547
+ if level == "debug":
548
+ fmt = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s"
549
+ self.logger = logging.getLogger(filename)
550
+ format_str = logging.Formatter(fmt)
551
+ self.logger.setLevel(self.level_relations.get(level))
552
+ sh = logging.StreamHandler()
553
+ sh.setFormatter(format_str)
554
+ th = handlers.TimedRotatingFileHandler(
555
+ filename=filename, when=when, backupCount=backCount, encoding="utf-8"
556
+ )
557
+ th.setFormatter(format_str)
558
+ self.logger.addHandler(sh)
559
+ self.logger.addHandler(th)
560
+ self.logger.info(
561
+ "==========================New Starting Here=============================="
562
+ )
563
+
564
+
565
+ def init_weights(m, mean=0.0, std=0.01):
566
+ classname = m.__class__.__name__
567
+ if classname.find("Conv") != -1:
568
+ m.weight.data.normal_(mean, std)
569
+
570
+
571
+ def get_padding(kernel_size, dilation=1):
572
+ return int((kernel_size * dilation - dilation) / 2)
573
+
574
+
575
+ def slice_segments(x, ids_str, segment_size=4):
576
+ ret = torch.zeros_like(x[:, :, :segment_size])
577
+ for i in range(x.size(0)):
578
+ idx_str = ids_str[i]
579
+ idx_end = idx_str + segment_size
580
+ ret[i] = x[i, :, idx_str:idx_end]
581
+ return ret
582
+
583
+
584
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
585
+ b, d, t = x.size()
586
+ if x_lengths is None:
587
+ x_lengths = t
588
+ ids_str_max = x_lengths - segment_size + 1
589
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
590
+ ret = slice_segments(x, ids_str, segment_size)
591
+ return ret, ids_str
592
+
593
+
594
+ def subsequent_mask(length):
595
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
596
+ return mask
597
+
598
+
599
+ @torch.jit.script
600
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
601
+ n_channels_int = n_channels[0]
602
+ in_act = input_a + input_b
603
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
604
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
605
+ acts = t_act * s_act
606
+ return acts
607
+
608
+
609
+ def convert_pad_shape(pad_shape):
610
+ l = pad_shape[::-1]
611
+ pad_shape = [item for sublist in l for item in sublist]
612
+ return pad_shape
613
+
614
+
615
+ def sequence_mask(length, max_length=None):
616
+ if max_length is None:
617
+ max_length = length.max()
618
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
619
+ return x.unsqueeze(0) < length.unsqueeze(1)
620
+
621
+
622
+ def generate_path(duration, mask):
623
+ """
624
+ duration: [b, 1, t_x]
625
+ mask: [b, 1, t_y, t_x]
626
+ """
627
+ device = duration.device
628
+
629
+ b, _, t_y, t_x = mask.shape
630
+ cum_duration = torch.cumsum(duration, -1)
631
+
632
+ cum_duration_flat = cum_duration.view(b * t_x)
633
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
634
+ path = path.view(b, t_x, t_y)
635
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
636
+ path = path.unsqueeze(1).transpose(2, 3) * mask
637
+ return path
638
+
639
+
640
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
641
+ if isinstance(parameters, torch.Tensor):
642
+ parameters = [parameters]
643
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
644
+ norm_type = float(norm_type)
645
+ if clip_value is not None:
646
+ clip_value = float(clip_value)
647
+
648
+ total_norm = 0
649
+ for p in parameters:
650
+ param_norm = p.grad.data.norm(norm_type)
651
+ total_norm += param_norm.item() ** norm_type
652
+ if clip_value is not None:
653
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
654
+ total_norm = total_norm ** (1.0 / norm_type)
655
+ return total_norm
656
+
657
+
658
+ def get_current_time():
659
+ pass
660
+
661
+
662
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
663
+ """
664
+ Args:
665
+ lengths:
666
+ A 1-D tensor containing sentence lengths.
667
+ max_len:
668
+ The length of masks.
669
+ Returns:
670
+ Return a 2-D bool tensor, where masked positions
671
+ are filled with `True` and non-masked positions are
672
+ filled with `False`.
673
+
674
+ >>> lengths = torch.tensor([1, 3, 2, 5])
675
+ >>> make_pad_mask(lengths)
676
+ tensor([[False, True, True, True, True],
677
+ [False, False, False, True, True],
678
+ [False, False, True, True, True],
679
+ [False, False, False, False, False]])
680
+ """
681
+ assert lengths.ndim == 1, lengths.ndim
682
+ max_len = max(max_len, lengths.max())
683
+ n = lengths.size(0)
684
+ seq_range = torch.arange(0, max_len, device=lengths.device)
685
+ expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
686
+
687
+ return expanded_lengths >= lengths.unsqueeze(-1)
688
+
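A small sketch of how the config helpers above compose (keys are illustrative only):

base = {"train": {"batch_size": 16, "lr": 1e-4}}
new = {"train": {"batch_size": 32}, "model": {"hidden": 256}}
merged = override_config(base, new)
# merged == {"train": {"batch_size": 32, "lr": 1e-4}, "model": {"hidden": 256}}

cfg = JsonHParams(**merged)
print(cfg.train.batch_size)  # 32; nested dicts become attribute-style objects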
utils/whisper.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import os
8
+ import pickle
9
+ from tqdm import tqdm
10
+ import numpy as np
11
+
12
+ from modules import whisper_extractor as whisper
13
+
14
+
15
+ def whisper_encoder_batch(model, audio_paths):
16
+ batch = len(audio_paths)
17
+ batch_mel = torch.zeros((batch, 80, 3000), dtype=torch.float32, device=model.device)
18
+
19
+ for i, audio_path in enumerate(audio_paths):
20
+ # (48000,)
21
+ audio = whisper.load_audio(str(audio_path))
22
+ audio = whisper.pad_or_trim(audio)
23
+
24
+ # (80, 3000)
25
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
26
+ batch_mel[i] = mel
27
+
28
+ with torch.no_grad():
29
+ # (batch, 1500, 1024)
30
+ features = model.embed_audio(batch_mel)
31
+
32
+ return features.cpu().detach().numpy()
33
+
34
+
35
+ def whisper_encoder(model, audio_path):
36
+ audio = whisper.load_audio(str(audio_path))
37
+ audio = whisper.pad_or_trim(audio)
38
+
39
+ # (80, 3000)
40
+ mel = whisper.log_mel_spectrogram(audio).to(model.device).unsqueeze(0)
41
+
42
+ with torch.no_grad():
43
+ # (1, 1500, 1024) -> # (1500, 1024)
44
+ features = model.embed_audio(mel).squeeze(0)
45
+
46
+ return features.cpu().detach().numpy()
47
+
48
+
49
+ def get_mapped_whisper_features(
50
+ raw_whisper_features, mapping_features, fast_mapping=True
51
+ ):
52
+ """
53
+ Whisper: frameshift = 20ms (30s audio -> 1500 frames), hop_size = 480 in 24k
54
+ # Ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/model.py#L136
55
+
56
+ Now it's only used for mapping to bigvgan's mels (sr = 24k, hop_size = 256, frameshift ~= 10.7 ms)
57
+ """
58
+ source_hop = 480
59
+ target_hop = 256
60
+
61
+ factor = np.gcd(source_hop, target_hop)
62
+ source_hop //= factor
63
+ target_hop //= factor
64
+ print(
65
+ "Mapping source's {} frames => target's {} frames".format(
66
+ target_hop, source_hop
67
+ )
68
+ )
69
+
70
+ max_source_len = 1500
71
+ whisper_features = []
72
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
73
+ # mapping_feat: (mels_frame_len, n_mels)
74
+ target_len = mapping_feat.shape[0]
75
+ # The max target_len is 2812
76
+ target_len = min(target_len, max_source_len * source_hop // target_hop)
77
+
78
+ # (1500, dim)
79
+ raw_feats = raw_whisper_features[index]
80
+ width = raw_feats.shape[-1]
81
+
82
+ if fast_mapping:
83
+ source_len = target_len * target_hop // source_hop + 1
84
+ raw_feats = raw_feats[:source_len]
85
+ else:
86
+ source_len = max_source_len
87
+
88
+ # const ~= target_len * target_hop
89
+ const = source_len * source_hop // target_hop * target_hop
90
+
91
+ # (source_len * source_hop, dim)
92
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
93
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
94
+ down_sampling_feats = np.average(
95
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
96
+ )
97
+ assert len(down_sampling_feats) >= target_len
98
+
99
+ # (target_len, dim)
100
+ feats = down_sampling_feats[:target_len]
101
+ whisper_features.append(feats)
102
+
103
+ return whisper_features
104
+
105
+
106
+ def load_whisper_model(hps):
107
+ print("Loading Whisper Model: ", hps.whisper_model)
108
+ model = whisper.load_model(hps.whisper_model)
109
+ if torch.cuda.is_available():
110
+ model = model.cuda()
111
+
112
+ model = model.eval()
113
+ return model
114
+
115
+
116
+ def load_target_acoustic_features(
117
+ output_path, dataset, acoustic_features_name, acoustic_features_fs, dataset_type
118
+ ):
119
+ mapping_dir = os.path.join(
120
+ output_path,
121
+ dataset,
122
+ "{}/{}".format(acoustic_features_name, acoustic_features_fs),
123
+ )
124
+ with open(os.path.join(mapping_dir, "{}.pkl".format(dataset_type)), "rb") as f:
125
+ mapping_features = pickle.load(f)
126
+
127
+ # Mels: (n_mels, frame_len) -> (frame_len, n_mels)
128
+ if acoustic_features_name == "mels":
129
+ print("Transposing mel features...")
130
+ mapping_features = [feat.T for feat in mapping_features]
131
+
132
+ print(
133
+ "Mapping to the acoustic features {}, #sz = {}, feats[0] is {}".format(
134
+ acoustic_features_name, len(mapping_features), mapping_features[0].shape
135
+ )
136
+ )
137
+ return mapping_features
138
+
139
+
140
+ def extract_whisper_features_of_dataset(
141
+ datasets,
142
+ model,
143
+ batch_size,
144
+ out_dir,
145
+ ):
146
+ audio_paths = [utt["Path"] for utt in datasets]
147
+ if len(audio_paths) < batch_size:
148
+ batch_size = len(audio_paths)
149
+
150
+ start, end = 0, 0
151
+ while end < len(audio_paths):
152
+ # Raw features: (batch_size, 1500, dim)
153
+ start = end
154
+ end = start + batch_size
155
+ tmp_raw_whisper_features = whisper_encoder_batch(model, audio_paths[start:end])
156
+
157
+ # Mapping to acoustic features' lengths
158
+ for index, utt in enumerate(tqdm(datasets[start:end])):
159
+ uid = utt["Uid"]
160
+ raw_whisper_feature = tmp_raw_whisper_features[index]
161
+
162
+ save_path = os.path.join(out_dir, uid + ".npy")
163
+ np.save(save_path, raw_whisper_feature)
164
+
165
+ print("{}/{} Done...".format(end, len(audio_paths)))
utils/world.py ADDED
@@ -0,0 +1,92 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # 1. Extract WORLD features including F0, AP, SP
7
+ # 2. Transform between SP and MCEP
8
+ import torchaudio
9
+ import pyworld as pw
10
+ import numpy as np
11
+ import torch
12
+ import diffsptk
13
+ import os
14
+ from tqdm import tqdm
15
+ import pickle
16
+ import torchaudio
17
+
18
+
19
+ def get_mcep_params(fs):
20
+ """Hyperparameters of transformation between SP and MCEP
21
+
22
+ Reference:
23
+ https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world_v2/copy_synthesis.sh
24
+
25
+ """
26
+ if fs in [44100, 48000]:
27
+ fft_size = 2048
28
+ alpha = 0.77
29
+ if fs in [16000]:
30
+ fft_size = 1024
31
+ alpha = 0.58
32
+ return fft_size, alpha
33
+
34
+
35
+ def extract_world_features(waveform, fs, frameshift=10):
36
+ # waveform: (1, seq)
37
+ # x: (seq,)
38
+ x = np.array(waveform, dtype=np.double).reshape(-1)  # flatten (1, seq) -> (seq,)
39
+
40
+ _f0, t = pw.dio(x, fs, frame_period=frameshift) # raw pitch extractor
41
+ f0 = pw.stonemask(x, _f0, t, fs) # pitch refinement
42
+ sp = pw.cheaptrick(x, f0, t, fs) # extract smoothed spectrogram
43
+ ap = pw.d4c(x, f0, t, fs) # extract aperiodicity
44
+
45
+ return f0, sp, ap, fs
46
+
47
+
48
+ def sp2mcep(x, mcsize, fs):
49
+ fft_size, alpha = get_mcep_params(fs)
50
+ x = torch.as_tensor(x, dtype=torch.float)
51
+
52
+ tmp = diffsptk.ScalarOperation("SquareRoot")(x)
53
+ tmp = diffsptk.ScalarOperation("Multiplication", 32768.0)(tmp)
54
+ mgc = diffsptk.MelCepstralAnalysis(
55
+ cep_order=mcsize - 1, fft_length=fft_size, alpha=alpha, n_iter=1
56
+ )(tmp)
57
+ return mgc.numpy()
58
+
59
+
60
+ def mcep2sp(x, mcsize, fs):
61
+ fft_size, alpha = get_mcep_params(fs)
62
+ x = torch.as_tensor(x, dtype=torch.float)
63
+
64
+ tmp = diffsptk.MelGeneralizedCepstrumToSpectrum(
65
+ alpha=alpha,
66
+ cep_order=mcsize - 1,
67
+ fft_length=fft_size,
68
+ )(x)
69
+ tmp = diffsptk.ScalarOperation("Division", 32768.0)(tmp)
70
+ sp = diffsptk.ScalarOperation("Power", 2)(tmp)
71
+ return sp.double().numpy()
72
+
73
+
74
+ def f0_statistics(f0_features, path):
75
+ print("\nF0 statistics...")
76
+
77
+ total_f0 = []
78
+ for f0 in tqdm(f0_features):
79
+ total_f0 += [f for f in f0 if f != 0]
80
+
81
+ mean = sum(total_f0) / len(total_f0)
82
+ print("Min = {}, Max = {}, Mean = {}".format(min(total_f0), max(total_f0), mean))
83
+
84
+ with open(path, "wb") as f:
85
+ pickle.dump([mean, total_f0], f)
86
+
87
+
88
+ def world_synthesis(f0, sp, ap, fs, frameshift):
89
+ y = pw.synthesize(
90
+ f0, sp, ap, fs, frame_period=frameshift
91
+ ) # synthesize an utterance using the parameters
92
+ return y
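A minimal analysis/synthesis round trip with the helpers above, assuming extract_world_features receives the sample rate fs as its second argument ("audio.wav" is a placeholder path):

import torchaudio

waveform, fs = torchaudio.load("audio.wav")
f0, sp, ap, fs = extract_world_features(waveform[0].numpy(), fs, frameshift=10)
y = world_synthesis(f0, sp, ap, fs, frameshift=10)  # resynthesized waveform as a numpy array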