yuancwang committed
Commit b725c5a
Parent(s): 3e8a9fc
init
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +61 -0
- app.py +31 -0
- config/audioldm.json +92 -0
- config/autoencoderkl.json +69 -0
- config/base.json +220 -0
- config/comosvc.json +216 -0
- config/diffusion.json +227 -0
- config/fs2.json +118 -0
- config/ns2.json +88 -0
- config/transformer.json +180 -0
- config/tts.json +23 -0
- config/valle.json +53 -0
- config/vits.json +101 -0
- config/vitssvc.json +192 -0
- config/vocoder.json +84 -0
- evaluation/__init__.py +0 -0
- evaluation/features/__init__.py +0 -0
- evaluation/features/long_term_average_spectrum.py +19 -0
- evaluation/features/signal_to_noise_ratio.py +133 -0
- evaluation/features/singing_power_ratio.py +108 -0
- evaluation/metrics/__init__.py +0 -0
- evaluation/metrics/energy/__init__.py +0 -0
- evaluation/metrics/energy/energy_pearson_coefficients.py +91 -0
- evaluation/metrics/energy/energy_rmse.py +86 -0
- evaluation/metrics/f0/__init__.py +0 -0
- evaluation/metrics/f0/f0_pearson_coefficients.py +111 -0
- evaluation/metrics/f0/f0_periodicity_rmse.py +112 -0
- evaluation/metrics/f0/f0_rmse.py +110 -0
- evaluation/metrics/f0/v_uv_f1.py +110 -0
- evaluation/metrics/intelligibility/__init__.py +0 -0
- evaluation/metrics/intelligibility/character_error_rate.py +81 -0
- evaluation/metrics/intelligibility/word_error_rate.py +81 -0
- evaluation/metrics/similarity/__init__.py +0 -0
- evaluation/metrics/similarity/models/RawNetBasicBlock.py +146 -0
- evaluation/metrics/similarity/models/RawNetModel.py +142 -0
- evaluation/metrics/similarity/models/__init__.py +0 -0
- evaluation/metrics/similarity/speaker_similarity.py +119 -0
- evaluation/metrics/spectrogram/__init__.py +0 -0
- evaluation/metrics/spectrogram/frechet_distance.py +31 -0
- evaluation/metrics/spectrogram/mel_cepstral_distortion.py +21 -0
- evaluation/metrics/spectrogram/multi_resolution_stft_distance.py +225 -0
- evaluation/metrics/spectrogram/pesq.py +56 -0
- evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py +45 -0
- evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py +45 -0
- evaluation/metrics/spectrogram/short_time_objective_intelligibility.py +56 -0
- models/tts/base/__init__.py +7 -0
- models/tts/base/tts_dataset.py +389 -0
- models/tts/base/tts_inferece.py +268 -0
- models/tts/base/tts_trainer.py +699 -0
- models/tts/fastspeech2/__init__.py +0 -0
.gitignore
ADDED
@@ -0,0 +1,61 @@
+# Mac OS files
+.DS_Store
+
+# IDEs
+.idea
+.vs
+.vscode
+.cache
+
+# GitHub files
+.github
+
+# Byte-compiled / optimized / DLL / cached files
+__pycache__/
+*.py[cod]
+*$py.class
+*.pyc
+.temp
+*.c
+*.so
+*.o
+
+# Developing mode
+_*.sh
+_*.json
+*.lst
+yard*
+*.out
+evaluation/evalset_selection
+mfa
+egs/svc/*wavmark
+egs/svc/custom
+egs/svc/*/dev*
+egs/svc/dev_exp_config.json
+bins/svc/demo*
+bins/svc/preprocess_custom.py
+data
+
+# Data and ckpt
+*.pkl
+*.pt
+*.npy
+*.npz
+!modules/whisper_extractor/assets/mel_filters.npz
+*.tar.gz
+*.ckpt
+*.wav
+*.flac
+pretrained/wenet/*conformer_exp
+
+# Runtime data dirs
+processed_data
+data
+model_ckpt
+logs
+*.ipynb
+*.lst
+source_audio
+result
+conversion_results
+get_available_gpu.py
app.py
ADDED
@@ -0,0 +1,31 @@
+import gradio as gr
+import os
+import torch
+
+
+
+def build_codec():
+    ...
+
+def build_model():
+    ...
+
+def ns2_inference(
+    prompt_audio_path,
+    text,
+    diffusion_steps=100,
+):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+demo_inputs = ...
+demo_outputs = ...
+
+demo = gr.Interface(
+    fn=ns2_inference,
+    inputs=demo_inputs,
+    outputs=demo_outputs,
+    title="Amphion Zero-Shot TTS NaturalSpeech2"
+)
+
+if __name__ == "__main__":
+    demo.launch()
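The placeholders above (build_codec, build_model, demo_inputs, demo_outputs) are left unfilled in this commit. A minimal sketch of how the interface could be wired up, assuming ordinary Gradio components matched to the ns2_inference signature (the component choices, ranges, and labels below are illustrative assumptions, not part of the commit):

import gradio as gr

# Hypothetical wiring for the elided stubs; labels and ranges are guesses.
demo_inputs = [
    gr.Audio(type="filepath", label="Prompt audio"),                  # prompt_audio_path
    gr.Textbox(label="Text to synthesize"),                           # text
    gr.Slider(10, 1000, value=100, step=1, label="Diffusion steps"),  # diffusion_steps
]
demo_outputs = gr.Audio(label="Synthesized speech")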
config/audioldm.json
ADDED
@@ -0,0 +1,92 @@
+{
+    "base_config": "config/base.json",
+    "model_type": "AudioLDM",
+    "task_type": "tta",
+    "dataset": [
+        "AudioCaps"
+    ],
+    "preprocess": {
+        // feature used for model training
+        "use_spkid": false,
+        "use_uv": false,
+        "use_frame_pitch": false,
+        "use_phone_pitch": false,
+        "use_frame_energy": false,
+        "use_phone_energy": false,
+        "use_mel": false,
+        "use_audio": false,
+        "use_label": false,
+        "use_one_hot": false,
+        "cond_mask_prob": 0.1
+    },
+    // model
+    "model": {
+        "audioldm": {
+            "image_size": 32,
+            "in_channels": 4,
+            "out_channels": 4,
+            "model_channels": 256,
+            "attention_resolutions": [
+                4,
+                2,
+                1
+            ],
+            "num_res_blocks": 2,
+            "channel_mult": [
+                1,
+                2,
+                4
+            ],
+            "num_heads": 8,
+            "use_spatial_transformer": true,
+            "transformer_depth": 1,
+            "context_dim": 768,
+            "use_checkpoint": true,
+            "legacy": false
+        },
+        "autoencoderkl": {
+            "ch": 128,
+            "ch_mult": [
+                1,
+                1,
+                2,
+                2,
+                4
+            ],
+            "num_res_blocks": 2,
+            "in_channels": 1,
+            "z_channels": 4,
+            "out_ch": 1,
+            "double_z": true
+        },
+        "noise_scheduler": {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "beta_schedule": "scaled_linear",
+            "clip_sample": false,
+            "steps_offset": 1,
+            "set_alpha_to_one": false,
+            "skip_prk_steps": true,
+            "prediction_type": "epsilon"
+        }
+    },
+    // train
+    "train": {
+        "lronPlateau": {
+            "factor": 0.9,
+            "patience": 100,
+            "min_lr": 4.0e-5,
+            "verbose": true
+        },
+        "adam": {
+            "lr": 5.0e-5,
+            "betas": [
+                0.9,
+                0.999
+            ],
+            "weight_decay": 1.0e-2,
+            "eps": 1.0e-8
+        }
+    }
+}
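Every config in this commit inherits from another file through its "base_config" key, and the files carry //-comments and trailing commas, so a plain json.load cannot parse them. A minimal sketch of a loader that handles both, assuming recursive child-over-parent merging (the function names are hypothetical, not Amphion's actual utilities):

import json
import re

def _strip_jsonc(text):
    # Naive JSONC cleanup: drop // comments and trailing commas.
    # (Assumes no "//" or stray commas inside string values.)
    text = re.sub(r"//[^\n]*", "", text)
    return re.sub(r",(\s*[}\]])", r"\1", text)

def _merge(base, override):
    # Child values win; nested dicts merge key by key.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _merge(base[key], value)
        else:
            base[key] = value
    return base

def load_config(path):
    with open(path) as f:
        cfg = json.loads(_strip_jsonc(f.read()))
    if "base_config" in cfg:
        parent = load_config(cfg.pop("base_config"))
        cfg = _merge(parent, cfg)
    return cfg

# e.g. load_config("config/audioldm.json") sees the base.json defaults
# overridden by the AudioLDM-specific values above.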
config/autoencoderkl.json
ADDED
@@ -0,0 +1,69 @@
+{
+    "base_config": "config/base.json",
+    "model_type": "AutoencoderKL",
+    "task_type": "tta",
+    "dataset": [
+        "AudioCaps"
+    ],
+    "preprocess": {
+        // feature used for model training
+        "use_spkid": false,
+        "use_uv": false,
+        "use_frame_pitch": false,
+        "use_phone_pitch": false,
+        "use_frame_energy": false,
+        "use_phone_energy": false,
+        "use_mel": false,
+        "use_audio": false,
+        "use_label": false,
+        "use_one_hot": false
+    },
+    // model
+    "model": {
+        "autoencoderkl": {
+            "ch": 128,
+            "ch_mult": [
+                1,
+                1,
+                2,
+                2,
+                4
+            ],
+            "num_res_blocks": 2,
+            "in_channels": 1,
+            "z_channels": 4,
+            "out_ch": 1,
+            "double_z": true
+        },
+        "loss": {
+            "kl_weight": 1e-8,
+            "disc_weight": 0.5,
+            "disc_factor": 1.0,
+            "logvar_init": 0.0,
+            "min_adapt_d_weight": 0.0,
+            "max_adapt_d_weight": 10.0,
+            "disc_start": 50001,
+            "disc_in_channels": 1,
+            "disc_num_layers": 3,
+            "use_actnorm": false
+        }
+    },
+    // train
+    "train": {
+        "lronPlateau": {
+            "factor": 0.9,
+            "patience": 100,
+            "min_lr": 4.0e-5,
+            "verbose": true
+        },
+        "adam": {
+            "lr": 4.0e-4,
+            "betas": [
+                0.9,
+                0.999
+            ],
+            "weight_decay": 1.0e-2,
+            "eps": 1.0e-8
+        }
+    }
+}
config/base.json
ADDED
@@ -0,0 +1,220 @@
+{
+    "supported_model_type": [
+        "GANVocoder",
+        "Fastspeech2",
+        "DiffSVC",
+        "Transformer",
+        "EDM",
+        "CD"
+    ],
+    "task_type": "",
+    "dataset": [],
+    "use_custom_dataset": false,
+    "preprocess": {
+        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
+        // trim audio silence
+        "data_augment": false,
+        "trim_silence": false,
+        "num_silent_frames": 8,
+        "trim_fft_size": 512, // fft size used in trimming
+        "trim_hop_size": 128, // hop size used in trimming
+        "trim_top_db": 30, // top db used in trimming, sensitive to each dataset
+        // acoustic features
+        "extract_mel": false,
+        "mel_extract_mode": "",
+        "extract_linear_spec": false,
+        "extract_mcep": false,
+        "extract_pitch": false,
+        "extract_acoustic_token": false,
+        "pitch_remove_outlier": false,
+        "extract_uv": false,
+        "pitch_norm": false,
+        "extract_audio": false,
+        "extract_label": false,
+        "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
+        "extract_energy": false,
+        "energy_remove_outlier": false,
+        "energy_norm": false,
+        "energy_extract_mode": "from_mel",
+        "extract_duration": false,
+        "extract_amplitude_phase": false,
+        "mel_min_max_norm": false,
+        // linguistic features
+        "extract_phone": false,
+        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+        // content features
+        "extract_whisper_feature": false,
+        "extract_contentvec_feature": false,
+        "extract_mert_feature": false,
+        "extract_wenet_feature": false,
+        // Settings for data preprocessing
+        "n_mel": 80,
+        "win_size": 480,
+        "hop_size": 120,
+        "sample_rate": 24000,
+        "n_fft": 1024,
+        "fmin": 0,
+        "fmax": 12000,
+        "min_level_db": -115,
+        "ref_level_db": 20,
+        "bits": 8,
+        // Directory names of processed data or extracted features
+        "processed_dir": "processed_data",
+        "trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wav
+        "raw_data": "raw_data",
+        "phone_dir": "phones",
+        "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
+        "audio_dir": "audios",
+        "log_amplitude_dir": "log_amplitudes",
+        "phase_dir": "phases",
+        "real_dir": "reals",
+        "imaginary_dir": "imaginarys",
+        "label_dir": "labels",
+        "linear_dir": "linears",
+        "mel_dir": "mels", // directory name of extracted mel features
+        "mcep_dir": "mcep", // directory name of extracted mcep features
+        "dur_dir": "durs",
+        "symbols_dict": "symbols.dict",
+        "lab_dir": "labs", // directory name of extracted label features
+        "wenet_dir": "wenet", // directory name of extracted wenet features
+        "contentvec_dir": "contentvec", // directory name of extracted contentvec features
+        "pitch_dir": "pitches", // directory name of extracted pitch features
+        "energy_dir": "energys", // directory name of extracted energy features
+        "phone_pitch_dir": "phone_pitches", // directory name of extracted pitch features
+        "phone_energy_dir": "phone_energys", // directory name of extracted energy features
+        "uv_dir": "uvs", // directory name of extracted unvoiced features
+        "duration_dir": "duration", // ground-truth duration file
+        "phone_seq_file": "phone_seq_file", // phoneme sequence file
+        "file_lst": "file.lst",
+        "train_file": "train.json", // training set; the json file contains detailed information about the dataset, including dataset name, utterance id, and duration of the utterance
+        "valid_file": "valid.json", // validation set
+        "spk2id": "spk2id.json", // used for multi-speaker dataset
+        "utt2spk": "utt2spk", // used for multi-speaker dataset
+        "emo2id": "emo2id.json", // used for multi-emotion dataset
+        "utt2emo": "utt2emo", // used for multi-emotion dataset
+        // Features used for model training
+        "use_text": false,
+        "use_phone": false,
+        "use_phn_seq": false,
+        "use_lab": false,
+        "use_linear": false,
+        "use_mel": false,
+        "use_min_max_norm_mel": false,
+        "use_wav": false,
+        "use_phone_pitch": false,
+        "use_log_scale_pitch": false,
+        "use_phone_energy": false,
+        "use_phone_duration": false,
+        "use_log_scale_energy": false,
+        "use_wenet": false,
+        "use_dur": false,
+        "use_spkid": false, // True: use speaker id for multi-speaker dataset
+        "use_emoid": false, // True: use emotion id for multi-emotion dataset
+        "use_frame_pitch": false,
+        "use_uv": false,
+        "use_frame_energy": false,
+        "use_frame_duration": false,
+        "use_audio": false,
+        "use_label": false,
+        "use_one_hot": false,
+        "use_amplitude_phase": false,
+        "data_augment": false,
+        "align_mel_duration": false
+    },
+    "train": {
+        "ddp": true,
+        "random_seed": 970227,
+        "batch_size": 16,
+        "max_steps": 1000000,
+        // Trackers
+        "tracker": [
+            "tensorboard"
+            // "wandb",
+            // "cometml",
+            // "mlflow",
+        ],
+        "max_epoch": -1,
+        // -1 means no limit
+        "save_checkpoint_stride": [
+            5,
+            20
+        ],
+        // unit is epoch
+        "keep_last": [
+            3,
+            -1
+        ],
+        // -1 means infinite; a single number will broadcast
+        "run_eval": [
+            false,
+            true
+        ],
+        // a single number will broadcast
+        // Fix the random seed
+        "random_seed": 10086,
+        // Optimizer
+        "optimizer": "AdamW",
+        "adamw": {
+            "lr": 4.0e-4
+            // nn model lr
+        },
+        // LR Scheduler
+        "scheduler": "ReduceLROnPlateau",
+        "reducelronplateau": {
+            "factor": 0.8,
+            "patience": 10,
+            // unit is epoch
+            "min_lr": 1.0e-4
+        },
+        // Batchsampler
+        "sampler": {
+            "holistic_shuffle": true,
+            "drop_last": true
+        },
+        // Dataloader
+        "dataloader": {
+            "num_worker": 32,
+            "pin_memory": true
+        },
+        "gradient_accumulation_step": 1,
+        "total_training_steps": 50000,
+        "save_summary_steps": 500,
+        "save_checkpoints_steps": 10000,
+        "valid_interval": 10000,
+        "keep_checkpoint_max": 5,
+        "multi_speaker_training": false, // True: train multi-speaker model; False: train single-speaker model
+        "max_epoch": -1,
+        // -1 means no limit
+        "save_checkpoint_stride": [
+            5,
+            20
+        ],
+        // unit is epoch
+        "keep_last": [
+            3,
+            -1
+        ],
+        // -1 means infinite; a single number will broadcast
+        "run_eval": [
+            false,
+            true
+        ],
+        // Batchsampler
+        "sampler": {
+            "holistic_shuffle": true,
+            "drop_last": true
+        },
+        // Dataloader
+        "dataloader": {
+            "num_worker": 32,
+            "pin_memory": true
+        },
+        // Trackers
+        "tracker": [
+            "tensorboard"
+            // "wandb",
+            // "cometml",
+            // "mlflow",
+        ],
+    },
+}
config/comosvc.json
ADDED
@@ -0,0 +1,216 @@
+{
+    "base_config": "config/base.json",
+    "model_type": "DiffComoSVC",
+    "task_type": "svc",
+    "use_custom_dataset": false,
+    "preprocess": {
+        // data augmentations
+        "use_pitch_shift": false,
+        "use_formant_shift": false,
+        "use_time_stretch": false,
+        "use_equalizer": false,
+        // acoustic features
+        "extract_mel": true,
+        "mel_min_max_norm": true,
+        "extract_pitch": true,
+        "pitch_extractor": "parselmouth",
+        "extract_uv": true,
+        "extract_energy": true,
+        // content features
+        "extract_whisper_feature": false,
+        "whisper_sample_rate": 16000,
+        "extract_contentvec_feature": false,
+        "contentvec_sample_rate": 16000,
+        "extract_wenet_feature": false,
+        "wenet_sample_rate": 16000,
+        "extract_mert_feature": false,
+        "mert_sample_rate": 16000,
+        // Default config for whisper
+        "whisper_frameshift": 0.01,
+        "whisper_downsample_rate": 2,
+        // Default config for content vector
+        "contentvec_frameshift": 0.02,
+        // Default config for mert
+        "mert_model": "m-a-p/MERT-v1-330M",
+        "mert_feature_layer": -1,
+        "mert_hop_size": 320,
+        // 24k
+        "mert_frameshit": 0.01333,
+        // 10ms
+        "wenet_frameshift": 0.01,
+        // wenetspeech is 4, gigaspeech is 6
+        "wenet_downsample_rate": 4,
+        // Default config
+        "n_mel": 100,
+        "win_size": 1024,
+        // todo
+        "hop_size": 256,
+        "sample_rate": 24000,
+        "n_fft": 1024,
+        // todo
+        "fmin": 0,
+        "fmax": 12000,
+        // todo
+        "f0_min": 50,
+        // ~C2
+        "f0_max": 1100,
+        //1100, // ~C6(1100), ~G5(800)
+        "pitch_bin": 256,
+        "pitch_max": 1100.0,
+        "pitch_min": 50.0,
+        "is_label": true,
+        "is_mu_law": true,
+        "bits": 8,
+        "mel_min_max_stats_dir": "mel_min_max_stats",
+        "whisper_dir": "whisper",
+        "contentvec_dir": "contentvec",
+        "wenet_dir": "wenet",
+        "mert_dir": "mert",
+        // Extract content features using dataloader
+        "pin_memory": true,
+        "num_workers": 8,
+        "content_feature_batch_size": 16,
+        // Features used for model training
+        "use_mel": true,
+        "use_min_max_norm_mel": true,
+        "use_frame_pitch": true,
+        "use_uv": true,
+        "use_frame_energy": true,
+        "use_log_scale_pitch": false,
+        "use_log_scale_energy": false,
+        "use_spkid": true,
+        // Meta file
+        "train_file": "train.json",
+        "valid_file": "test.json",
+        "spk2id": "singers.json",
+        "utt2spk": "utt2singer"
+    },
+    "model": {
+        "teacher_model_path": "[Your Teacher Model Path].bin",
+        "condition_encoder": {
+            "merge_mode": "add",
+            "input_melody_dim": 1,
+            "use_log_f0": true,
+            "n_bins_melody": 256,
+            //# Quantization (0 for not quantization)
+            "output_melody_dim": 384,
+            "input_loudness_dim": 1,
+            "use_log_loudness": true,
+            "n_bins_loudness": 256,
+            "output_loudness_dim": 384,
+            "use_whisper": false,
+            "use_contentvec": false,
+            "use_wenet": false,
+            "use_mert": false,
+            "whisper_dim": 1024,
+            "contentvec_dim": 256,
+            "mert_dim": 256,
+            "wenet_dim": 512,
+            "content_encoder_dim": 384,
+            "output_singer_dim": 384,
+            "singer_table_size": 512,
+            "output_content_dim": 384,
+            "use_spkid": true
+        },
+        "comosvc": {
+            "distill": false,
+            // conformer encoder
+            "input_dim": 384,
+            "output_dim": 100,
+            "n_heads": 2,
+            "n_layers": 6,
+            "filter_channels": 512,
+            "dropout": 0.1,
+            // karras diffusion
+            "P_mean": -1.2,
+            "P_std": 1.2,
+            "sigma_data": 0.5,
+            "sigma_min": 0.002,
+            "sigma_max": 80,
+            "rho": 7,
+            "n_timesteps": 40,
+        },
+        "diffusion": {
+            // Diffusion steps encoder
+            "step_encoder": {
+                "dim_raw_embedding": 128,
+                "dim_hidden_layer": 512,
+                "activation": "SiLU",
+                "num_layer": 2,
+                "max_period": 10000
+            },
+            // Diffusion decoder
+            "model_type": "bidilconv",
+            // bidilconv, unet2d, TODO: unet1d
+            "bidilconv": {
+                "base_channel": 384,
+                "n_res_block": 20,
+                "conv_kernel_size": 3,
+                "dilation_cycle_length": 4,
+                // specially, 1 means no dilation
+                "conditioner_size": 100
+            }
+        },
+    },
+    "train": {
+        // Basic settings
+        "fast_steps": 0,
+        "batch_size": 32,
+        "gradient_accumulation_step": 1,
+        "max_epoch": -1,
+        // -1 means no limit
+        "save_checkpoint_stride": [
+            10,
+            100
+        ],
+        // unit is epoch
+        "keep_last": [
+            3,
+            -1
+        ],
+        // -1 means infinite; a single number will broadcast
+        "run_eval": [
+            false,
+            true
+        ],
+        // a single number will broadcast
+        // Fix the random seed
+        "random_seed": 10086,
+        // Batchsampler
+        "sampler": {
+            "holistic_shuffle": true,
+            "drop_last": true
+        },
+        // Dataloader
+        "dataloader": {
+            "num_worker": 32,
+            "pin_memory": true
+        },
+        // Trackers
+        "tracker": [
+            "tensorboard"
+            // "wandb",
+            // "cometml",
+            // "mlflow",
+        ],
+        // Optimizer
+        "optimizer": "AdamW",
+        "adamw": {
+            "lr": 4.0e-4
+            // nn model lr
+        },
+        // LR Scheduler
+        "scheduler": "ReduceLROnPlateau",
+        "reducelronplateau": {
+            "factor": 0.8,
+            "patience": 10,
+            // unit is epoch
+            "min_lr": 1.0e-4
+        }
+    },
+    "inference": {
+        "comosvc": {
+            "inference_steps": 40
+        }
+    }
+}
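The "karras diffusion" keys in the comosvc block (sigma_min, sigma_max, rho, n_timesteps) follow the noise schedule of Karras et al. (2022) that consistency-model samplers step through. A sketch of how those four values define the sampling sigmas (illustrative, not Amphion's code):

import torch

def karras_sigmas(n_timesteps=40, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    # sigma_i = (s_max^(1/rho) + i/(N-1) * (s_min^(1/rho) - s_max^(1/rho)))^rho
    ramp = torch.linspace(0, 1, n_timesteps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas()  # 40 noise levels, descending from 80 to 0.002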
config/diffusion.json
ADDED
@@ -0,0 +1,227 @@
+{
+    // FIXME: THESE ARE LEGACY
+    "base_config": "config/base.json",
+    "model_type": "diffusion",
+    "task_type": "svc",
+    "use_custom_dataset": false,
+    "preprocess": {
+        // data augmentations
+        "use_pitch_shift": false,
+        "use_formant_shift": false,
+        "use_time_stretch": false,
+        "use_equalizer": false,
+        // acoustic features
+        "extract_mel": true,
+        "mel_min_max_norm": true,
+        "extract_pitch": true,
+        "pitch_extractor": "parselmouth",
+        "extract_uv": true,
+        "extract_energy": true,
+        // content features
+        "extract_whisper_feature": false,
+        "whisper_sample_rate": 16000,
+        "extract_contentvec_feature": false,
+        "contentvec_sample_rate": 16000,
+        "extract_wenet_feature": false,
+        "wenet_sample_rate": 16000,
+        "extract_mert_feature": false,
+        "mert_sample_rate": 16000,
+        // Default config for whisper
+        "whisper_frameshift": 0.01,
+        "whisper_downsample_rate": 2,
+        // Default config for content vector
+        "contentvec_frameshift": 0.02,
+        // Default config for mert
+        "mert_model": "m-a-p/MERT-v1-330M",
+        "mert_feature_layer": -1,
+        "mert_hop_size": 320,
+        // 24k
+        "mert_frameshit": 0.01333,
+        // 10ms
+        "wenet_frameshift": 0.01,
+        // wenetspeech is 4, gigaspeech is 6
+        "wenet_downsample_rate": 4,
+        // Default config
+        "n_mel": 100,
+        "win_size": 1024,
+        // todo
+        "hop_size": 256,
+        "sample_rate": 24000,
+        "n_fft": 1024,
+        // todo
+        "fmin": 0,
+        "fmax": 12000,
+        // todo
+        "f0_min": 50,
+        // ~C2
+        "f0_max": 1100,
+        //1100, // ~C6(1100), ~G5(800)
+        "pitch_bin": 256,
+        "pitch_max": 1100.0,
+        "pitch_min": 50.0,
+        "is_label": true,
+        "is_mu_law": true,
+        "bits": 8,
+        "mel_min_max_stats_dir": "mel_min_max_stats",
+        "whisper_dir": "whisper",
+        "contentvec_dir": "contentvec",
+        "wenet_dir": "wenet",
+        "mert_dir": "mert",
+        // Extract content features using dataloader
+        "pin_memory": true,
+        "num_workers": 8,
+        "content_feature_batch_size": 16,
+        // Features used for model training
+        "use_mel": true,
+        "use_min_max_norm_mel": true,
+        "use_frame_pitch": true,
+        "use_uv": true,
+        "use_frame_energy": true,
+        "use_log_scale_pitch": false,
+        "use_log_scale_energy": false,
+        "use_spkid": true,
+        // Meta file
+        "train_file": "train.json",
+        "valid_file": "test.json",
+        "spk2id": "singers.json",
+        "utt2spk": "utt2singer"
+    },
+    "model": {
+        "condition_encoder": {
+            "merge_mode": "add",
+            "input_melody_dim": 1,
+            "use_log_f0": true,
+            "n_bins_melody": 256,
+            //# Quantization (0 for not quantization)
+            "output_melody_dim": 384,
+            "input_loudness_dim": 1,
+            "use_log_loudness": true,
+            "n_bins_loudness": 256,
+            "output_loudness_dim": 384,
+            "use_whisper": false,
+            "use_contentvec": false,
+            "use_wenet": false,
+            "use_mert": false,
+            "whisper_dim": 1024,
+            "contentvec_dim": 256,
+            "mert_dim": 256,
+            "wenet_dim": 512,
+            "content_encoder_dim": 384,
+            "output_singer_dim": 384,
+            "singer_table_size": 512,
+            "output_content_dim": 384,
+            "use_spkid": true
+        },
+        // FIXME: FOLLOWING ARE NEW!!
+        "diffusion": {
+            "scheduler": "ddpm",
+            "scheduler_settings": {
+                "num_train_timesteps": 1000,
+                "beta_start": 1.0e-4,
+                "beta_end": 0.02,
+                "beta_schedule": "linear"
+            },
+            // Diffusion steps encoder
+            "step_encoder": {
+                "dim_raw_embedding": 128,
+                "dim_hidden_layer": 512,
+                "activation": "SiLU",
+                "num_layer": 2,
+                "max_period": 10000
+            },
+            // Diffusion decoder
+            "model_type": "bidilconv",
+            // bidilconv, unet2d, TODO: unet1d
+            "bidilconv": {
+                "base_channel": 384,
+                "n_res_block": 20,
+                "conv_kernel_size": 3,
+                "dilation_cycle_length": 4,
+                // specially, 1 means no dilation
+                "conditioner_size": 384
+            },
+            "unet2d": {
+                "in_channels": 1,
+                "out_channels": 1,
+                "down_block_types": [
+                    "CrossAttnDownBlock2D",
+                    "CrossAttnDownBlock2D",
+                    "CrossAttnDownBlock2D",
+                    "DownBlock2D"
+                ],
+                "mid_block_type": "UNetMidBlock2DCrossAttn",
+                "up_block_types": [
+                    "UpBlock2D",
+                    "CrossAttnUpBlock2D",
+                    "CrossAttnUpBlock2D",
+                    "CrossAttnUpBlock2D"
+                ],
+                "only_cross_attention": false
+            }
+        }
+    },
+    // FIXME: FOLLOWING ARE NEW!!
+    "train": {
+        // Basic settings
+        "batch_size": 64,
+        "gradient_accumulation_step": 1,
+        "max_epoch": -1,
+        // -1 means no limit
+        "save_checkpoint_stride": [
+            5,
+            20
+        ],
+        // unit is epoch
+        "keep_last": [
+            3,
+            -1
+        ],
+        // -1 means infinite; a single number will broadcast
+        "run_eval": [
+            false,
+            true
+        ],
+        // a single number will broadcast
+        // Fix the random seed
+        "random_seed": 10086,
+        // Batchsampler
+        "sampler": {
+            "holistic_shuffle": true,
+            "drop_last": true
+        },
+        // Dataloader
+        "dataloader": {
+            "num_worker": 32,
+            "pin_memory": true
+        },
+        // Trackers
+        "tracker": [
+            "tensorboard"
+            // "wandb",
+            // "cometml",
+            // "mlflow",
+        ],
+        // Optimizer
+        "optimizer": "AdamW",
+        "adamw": {
+            "lr": 4.0e-4
+            // nn model lr
+        },
+        // LR Scheduler
+        "scheduler": "ReduceLROnPlateau",
+        "reducelronplateau": {
+            "factor": 0.8,
+            "patience": 10,
+            // unit is epoch
+            "min_lr": 1.0e-4
+        }
+    },
+    "inference": {
+        "diffusion": {
+            "scheduler": "pndm",
+            "scheduler_settings": {
+                "num_inference_timesteps": 1000
+            }
+        }
+    }
+}
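The scheduler_settings keys above ("ddpm" with num_train_timesteps, beta_start, beta_end, beta_schedule) mirror the constructor arguments of the schedulers in Hugging Face diffusers, so the block can plausibly be passed straight through; a sketch under that assumption (Amphion's actual wiring may differ):

from diffusers import DDPMScheduler, PNDMScheduler

train_settings = {
    "num_train_timesteps": 1000,
    "beta_start": 1.0e-4,
    "beta_end": 0.02,
    "beta_schedule": "linear",
}
noise_scheduler = DDPMScheduler(**train_settings)  # used to add noise during training

# The inference block swaps in "pndm"; its num_inference_timesteps would map to
# PNDMScheduler.set_timesteps().
sampler = PNDMScheduler(num_train_timesteps=1000)
sampler.set_timesteps(1000)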
config/fs2.json
ADDED
@@ -0,0 +1,118 @@
+{
+    "base_config": "config/tts.json",
+    "model_type": "FastSpeech2",
+    "task_type": "tts",
+    "dataset": ["LJSpeech"],
+    "preprocess": {
+        // acoustic features
+        "extract_audio": true,
+        "extract_mel": true,
+        "mel_extract_mode": "taco",
+        "mel_min_max_norm": false,
+        "extract_pitch": true,
+        "extract_uv": false,
+        "pitch_extractor": "dio",
+        "extract_energy": true,
+        "energy_extract_mode": "from_tacotron_stft",
+        "extract_duration": true,
+        "use_phone": true,
+        "pitch_norm": true,
+        "energy_norm": true,
+        "pitch_remove_outlier": true,
+        "energy_remove_outlier": true,
+
+        // Default config
+        "n_mel": 80,
+        "win_size": 1024, // todo
+        "hop_size": 256,
+        "sample_rate": 22050,
+        "n_fft": 1024, // todo
+        "fmin": 0,
+        "fmax": 8000, // todo
+        "raw_data": "raw_data",
+        "text_cleaners": ["english_cleaners"],
+        "f0_min": 71, // ~C2
+        "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
+        "pitch_bin": 256,
+        "pitch_max": 1100.0,
+        "pitch_min": 50.0,
+        "is_label": true,
+        "is_mu_law": true,
+        "bits": 8,
+
+        "mel_min_max_stats_dir": "mel_min_max_stats",
+        "whisper_dir": "whisper",
+        "content_vector_dir": "content_vector",
+        "wenet_dir": "wenet",
+        "mert_dir": "mert",
+        "spk2id": "spk2id.json",
+        "utt2spk": "utt2spk",
+
+        // Features used for model training
+        "use_mel": true,
+        "use_min_max_norm_mel": false,
+        "use_frame_pitch": false,
+        "use_frame_energy": false,
+        "use_phone_pitch": true,
+        "use_phone_energy": true,
+        "use_log_scale_pitch": false,
+        "use_log_scale_energy": false,
+        "use_spkid": false,
+        "align_mel_duration": true,
+        "text_cleaners": ["english_cleaners"],
+        "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+    },
+    "model": {
+        // Settings for transformer
+        "transformer": {
+            "encoder_layer": 4,
+            "encoder_head": 2,
+            "encoder_hidden": 256,
+            "decoder_layer": 6,
+            "decoder_head": 2,
+            "decoder_hidden": 256,
+            "conv_filter_size": 1024,
+            "conv_kernel_size": [9, 1],
+            "encoder_dropout": 0.2,
+            "decoder_dropout": 0.2
+        },
+
+        // Settings for variance_predictor
+        "variance_predictor": {
+            "filter_size": 256,
+            "kernel_size": 3,
+            "dropout": 0.5
+        },
+        "variance_embedding": {
+            "pitch_quantization": "linear", // support 'linear' or 'log'; 'log' is allowed only if the pitch values are not normalized during preprocessing
+            "energy_quantization": "linear", // support 'linear' or 'log'; 'log' is allowed only if the energy values are not normalized during preprocessing
+            "n_bins": 256
+        },
+        "max_seq_len": 1000
+    },
+    "train": {
+        "batch_size": 16,
+        "sort_sample": true,
+        "drop_last": true,
+        "group_size": 4,
+        "grad_clip_thresh": 1.0,
+        "dataloader": {
+            "num_worker": 8,
+            "pin_memory": true
+        },
+        "lr_scheduler": {
+            "num_warmup": 4000
+        },
+        // LR Scheduler
+        "scheduler": "NoamLR",
+        // Optimizer
+        "optimizer": "Adam",
+        "adam": {
+            "lr": 0.0625,
+            "betas": [0.9, 0.98],
+            "eps": 0.000000001,
+            "weight_decay": 0.0
+        },
+    }
+
+}
config/ns2.json
ADDED
@@ -0,0 +1,88 @@
+{
+    "base_config": "config/base.json",
+    "model_type": "NaturalSpeech2",
+    "dataset": ["LibriTTS"],
+    "preprocess": {
+        "use_mel": false,
+        "use_code": true,
+        "use_spkid": true,
+        "use_pitch": true,
+        "use_duration": true,
+        "use_phone": true,
+        "use_len": true,
+        "use_cross_reference": true,
+        "train_file": "train.json",
+        "melspec_dir": "mel",
+        "code_dir": "code",
+        "pitch_dir": "pitch",
+        "duration_dir": "duration",
+        "clip_mode": "start"
+    },
+    "model": {
+        "latent_dim": 128,
+        "prior_encoder": {
+            "vocab_size": 100,
+            "pitch_min": 50,
+            "pitch_max": 1100,
+            "pitch_bins_num": 512,
+            "encoder": {
+                "encoder_layer": 6,
+                "encoder_hidden": 512,
+                "encoder_head": 8,
+                "conv_filter_size": 2048,
+                "conv_kernel_size": 9,
+                "encoder_dropout": 0.2,
+                "use_cln": true
+            },
+            "duration_predictor": {
+                "input_size": 512,
+                "filter_size": 512,
+                "kernel_size": 3,
+                "conv_layers": 30,
+                "cross_attn_per_layer": 3,
+                "attn_head": 8,
+                "drop_out": 0.5
+            },
+            "pitch_predictor": {
+                "input_size": 512,
+                "filter_size": 512,
+                "kernel_size": 5,
+                "conv_layers": 30,
+                "cross_attn_per_layer": 3,
+                "attn_head": 8,
+                "drop_out": 0.5
+            }
+        },
+        "diffusion": {
+            "wavenet": {
+                "input_size": 128,
+                "hidden_size": 512,
+                "out_size": 128,
+                "num_layers": 40,
+                "cross_attn_per_layer": 3,
+                "dilation_cycle": 2,
+                "attn_head": 8,
+                "drop_out": 0.2
+            },
+            "beta_min": 0.05,
+            "beta_max": 20,
+            "sigma": 1.0,
+            "noise_factor": 1.0,
+            "ode_solver": "euler"
+        },
+        "prompt_encoder": {
+            "encoder_layer": 6,
+            "encoder_hidden": 512,
+            "encoder_head": 8,
+            "conv_filter_size": 2048,
+            "conv_kernel_size": 9,
+            "encoder_dropout": 0.2,
+            "use_cln": false
+        },
+        "query_emb": {
+            "query_token_num": 32,
+            "hidden_size": 512,
+            "head_num": 8
+        }
+    }
+}
config/transformer.json
ADDED
@@ -0,0 +1,180 @@
+{
+    "base_config": "config/base.json",
+    "model_type": "Transformer",
+    "task_type": "svc",
+    "use_custom_dataset": false,
+    "preprocess": {
+        // data augmentations
+        "use_pitch_shift": false,
+        "use_formant_shift": false,
+        "use_time_stretch": false,
+        "use_equalizer": false,
+        // acoustic features
+        "extract_mel": true,
+        "mel_min_max_norm": true,
+        "extract_pitch": true,
+        "pitch_extractor": "parselmouth",
+        "extract_uv": true,
+        "extract_energy": true,
+        // content features
+        "extract_whisper_feature": false,
+        "whisper_sample_rate": 16000,
+        "extract_contentvec_feature": false,
+        "contentvec_sample_rate": 16000,
+        "extract_wenet_feature": false,
+        "wenet_sample_rate": 16000,
+        "extract_mert_feature": false,
+        "mert_sample_rate": 16000,
+        // Default config for whisper
+        "whisper_frameshift": 0.01,
+        "whisper_downsample_rate": 2,
+        // Default config for content vector
+        "contentvec_frameshift": 0.02,
+        // Default config for mert
+        "mert_model": "m-a-p/MERT-v1-330M",
+        "mert_feature_layer": -1,
+        "mert_hop_size": 320,
+        // 24k
+        "mert_frameshit": 0.01333,
+        // 10ms
+        "wenet_frameshift": 0.01,
+        // wenetspeech is 4, gigaspeech is 6
+        "wenet_downsample_rate": 4,
+        // Default config
+        "n_mel": 100,
+        "win_size": 1024,
+        // todo
+        "hop_size": 256,
+        "sample_rate": 24000,
+        "n_fft": 1024,
+        // todo
+        "fmin": 0,
+        "fmax": 12000,
+        // todo
+        "f0_min": 50,
+        // ~C2
+        "f0_max": 1100,
+        //1100, // ~C6(1100), ~G5(800)
+        "pitch_bin": 256,
+        "pitch_max": 1100.0,
+        "pitch_min": 50.0,
+        "is_label": true,
+        "is_mu_law": true,
+        "bits": 8,
+        "mel_min_max_stats_dir": "mel_min_max_stats",
+        "whisper_dir": "whisper",
+        "contentvec_dir": "contentvec",
+        "wenet_dir": "wenet",
+        "mert_dir": "mert",
+        // Extract content features using dataloader
+        "pin_memory": true,
+        "num_workers": 8,
+        "content_feature_batch_size": 16,
+        // Features used for model training
+        "use_mel": true,
+        "use_min_max_norm_mel": true,
+        "use_frame_pitch": true,
+        "use_uv": true,
+        "use_frame_energy": true,
+        "use_log_scale_pitch": false,
+        "use_log_scale_energy": false,
+        "use_spkid": true,
+        // Meta file
+        "train_file": "train.json",
+        "valid_file": "test.json",
+        "spk2id": "singers.json",
+        "utt2spk": "utt2singer"
+    },
+    "model": {
+        "condition_encoder": {
+            "merge_mode": "add",
+            "input_melody_dim": 1,
+            "use_log_f0": true,
+            "n_bins_melody": 256,
+            //# Quantization (0 for not quantization)
+            "output_melody_dim": 384,
+            "input_loudness_dim": 1,
+            "use_log_loudness": true,
+            "n_bins_loudness": 256,
+            "output_loudness_dim": 384,
+            "use_whisper": false,
+            "use_contentvec": true,
+            "use_wenet": false,
+            "use_mert": false,
+            "whisper_dim": 1024,
+            "contentvec_dim": 256,
+            "mert_dim": 256,
+            "wenet_dim": 512,
+            "content_encoder_dim": 384,
+            "output_singer_dim": 384,
+            "singer_table_size": 512,
+            "output_content_dim": 384,
+            "use_spkid": true
+        },
+        "transformer": {
+            "type": "conformer",
+            // 'conformer' or 'transformer'
+            "input_dim": 384,
+            "output_dim": 100,
+            "n_heads": 2,
+            "n_layers": 6,
+            "filter_channels": 512,
+            "dropout": 0.1,
+        }
+    },
+    "train": {
+        // Basic settings
+        "batch_size": 64,
+        "gradient_accumulation_step": 1,
+        "max_epoch": -1,
+        // -1 means no limit
+        "save_checkpoint_stride": [
+            10,
+            100
+        ],
+        // unit is epoch
+        "keep_last": [
+            3,
+            -1
+        ],
+        // -1 means infinite; a single number will broadcast
+        "run_eval": [
+            false,
+            true
+        ],
+        // a single number will broadcast
+        // Fix the random seed
+        "random_seed": 10086,
+        // Batchsampler
+        "sampler": {
+            "holistic_shuffle": true,
+            "drop_last": true
+        },
+        // Dataloader
+        "dataloader": {
+            "num_worker": 32,
+            "pin_memory": true
+        },
+        // Trackers
+        "tracker": [
+            "tensorboard"
+            // "wandb",
+            // "cometml",
+            // "mlflow",
+        ],
+        // Optimizer
+        "optimizer": "AdamW",
+        "adamw": {
+            "lr": 4.0e-4
+            // nn model lr
+        },
+        // LR Scheduler
+        "scheduler": "ReduceLROnPlateau",
+        "reducelronplateau": {
+            "factor": 0.8,
+            "patience": 10,
+            // unit is epoch
+            "min_lr": 1.0e-4
+        }
+    }
+}
config/tts.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "base_config": "config/base.json",
+    "supported_model_type": [
+        "Fastspeech2",
+        "VITS",
+        "VALLE",
+    ],
+    "task_type": "tts",
+    "preprocess": {
+        "language": "en-us",
+        // linguistic features
+        "extract_phone": true,
+        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+        // Directory names of processed data or extracted features
+        "phone_dir": "phones",
+        "use_phone": true,
+    },
+    "model": {
+        "text_token_num": 512,
+    }
+
+}
config/valle.json
ADDED
@@ -0,0 +1,53 @@
+{
+    "base_config": "config/tts.json",
+    "model_type": "VALLE",
+    "task_type": "tts",
+    "dataset": [
+        "libritts"
+    ],
+    "preprocess": {
+        "extract_phone": true,
+        "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
+        "extract_acoustic_token": true,
+        "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
+        "acoustic_token_dir": "acoutic_tokens",
+        "use_text": false,
+        "use_phone": true,
+        "use_acoustic_token": true,
+        "symbols_dict": "symbols.dict",
+        "min_duration": 0.5, // the duration lower bound used to filter out audio with duration < min_duration
+        "max_duration": 14, // the duration upper bound used to filter out audio with duration > max_duration
+        "sample_rate": 24000,
+        "codec_hop_size": 320
+    },
+    "model": {
+        "text_token_num": 512,
+        "audio_token_num": 1024,
+        "decoder_dim": 1024, // embedding dimension of the decoder model
+        "nhead": 16, // number of attention heads in the decoder layers
+        "num_decoder_layers": 12, // number of decoder layers
+        "norm_first": true, // pre or post Normalization
+        "add_prenet": false, // whether to add a PreNet after the inputs
+        "prefix_mode": 0, // mode for how to prefix the VALL-E NAR Decoder; 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
+        "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
+        "nar_scale_factor": 1, // model scale factor, which will be assigned different meanings in different models
+        "prepend_bos": false, // whether to prepend <BOS> to the acoustic tokens -> AR Decoder inputs
+        "num_quantizers": 8, // number of the audio quantization layers
+        // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
+    },
+    "train": {
+        "ddp": false,
+        "train_stage": 1, // 0: train all modules; for VALL-E, 1: AR Decoder, 2: NAR Decoder(s)
+        "max_epoch": 20,
+        "optimizer": "AdamW",
+        "scheduler": "cosine",
+        "warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
+        "base_lr": 1e-4, // base learning rate
+        "valid_interval": 1000,
+        "log_epoch_step": 1000,
+        "save_checkpoint_stride": [
+            1,
+            1
+        ]
+    }
+}
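With "acoustic_token_extractor": "Encodec", the "sample_rate": 24000 and "codec_hop_size": 320 above match the 24 kHz EnCodec model (320-sample hop, 75 frames per second), and "num_quantizers": 8 corresponds to its first eight codebooks. A sketch of extracting such tokens with the transformers port of EnCodec (an assumption about the tooling, not necessarily what Amphion's preprocessor calls):

import torch
from transformers import AutoProcessor, EncodecModel

model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

wav = torch.zeros(24000)  # stand-in for a real 1 s utterance at 24 kHz
inputs = processor(raw_audio=wav.numpy(), sampling_rate=24000, return_tensors="pt")
with torch.no_grad():
    encoded = model.encode(inputs["input_values"], bandwidth=6.0)  # 6 kbps -> 8 codebooks
codes = encoded.audio_codes  # shape (1, 1, 8, 75): 8 quantizer streams, 75 frames/s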
config/vits.json
ADDED
@@ -0,0 +1,101 @@
+{
+    "base_config": "config/tts.json",
+    "model_type": "VITS",
+    "task_type": "tts",
+    "preprocess": {
+        "extract_phone": true,
+        "extract_mel": true,
+        "n_mel": 80,
+        "fmin": 0,
+        "fmax": null,
+        "extract_linear_spec": true,
+        "extract_audio": true,
+        "use_linear": true,
+        "use_mel": true,
+        "use_audio": true,
+        "use_text": false,
+        "use_phone": true,
+        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+        "n_fft": 1024,
+        "win_size": 1024,
+        "hop_size": 256,
+        "segment_size": 8192,
+        "text_cleaners": [
+            "english_cleaners"
+        ]
+    },
+    "model": {
+        "text_token_num": 512,
+        "inter_channels": 192,
+        "hidden_channels": 192,
+        "filter_channels": 768,
+        "n_heads": 2,
+        "n_layers": 6,
+        "kernel_size": 3,
+        "p_dropout": 0.1,
+        "resblock": "1",
+        "resblock_kernel_sizes": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel": 512,
+        "upsample_kernel_sizes": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "n_layers_q": 3,
+        "use_spectral_norm": false,
+        "n_speakers": 0, // number of speakers; will be automatically set if n_speakers is 0 and multi_speaker_training is true
+        "gin_channels": 256,
+        "use_sdp": true
+    },
+    "train": {
+        "fp16_run": true,
+        "learning_rate": 2e-4,
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-9,
+        "batch_size": 16,
+        "lr_decay": 0.999875,
+        // "segment_size": 8192,
+        "init_lr_ratio": 1,
+        "warmup_epochs": 0,
+        "c_mel": 45,
+        "c_kl": 1.0,
+        "AdamW": {
+            "betas": [
+                0.8,
+                0.99
+            ],
+            "eps": 1e-9,
+        }
+    }
+}
config/vitssvc.json
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{
    "base_config": "config/base.json",
    "model_type": "VITS",
    "task_type": "svc",
    "preprocess": {
        "extract_phone": false,
        "extract_mel": true,
        "extract_linear_spec": true,
        "extract_audio": true,
        "use_linear": true,
        "use_mel": true,
        "use_audio": true,
        "use_text": false,
        "use_phone": true,

        "fmin": 0,
        "fmax": null,
        "f0_min": 50,
        "f0_max": 1100,
        // f0_bin in sovits
        "pitch_bin": 256,
        // filter_length in sovits
        "n_fft": 2048,
        // hop_length in sovits
        "hop_size": 512,
        // win_length in sovits
        "win_size": 2048,
        "segment_size": 8192,
        "n_mel": 100,
        "sample_rate": 44100,

        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256,
            // Quantization (0 for no quantization)
            "output_melody_dim": 196,
            "input_loudness_dim": 1,
            "use_log_loudness": false,
            "n_bins_loudness": 256,
            "output_loudness_dim": 196,
            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 196,
            "output_singer_dim": 196,
            "singer_table_size": 512,
            "output_content_dim": 196,
            "use_spkid": true
        },
        "vits": {
            "filter_channels": 256,
            "gin_channels": 256,
            "hidden_channels": 192,
            "inter_channels": 192,
            "kernel_size": 3,
            "n_flow_layer": 4,
            "n_heads": 2,
            "n_layers": 6,
            "n_layers_q": 3,
            "n_speakers": 512,
            "p_dropout": 0.1,
            "ssl_dim": 256,
            "use_spectral_norm": false
        },
        "generator": "hifigan",
        "generator_config": {
            "hifigan": {
                "resblock": "1",
                "resblock_kernel_sizes": [3, 7, 11],
                "upsample_rates": [8, 8, 2, 2, 2],
                "upsample_kernel_sizes": [16, 16, 4, 4, 4],
                "upsample_initial_channel": 512,
                "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
            },
            "melgan": {
                "ratios": [8, 8, 2, 2, 2],
                "ngf": 32,
                "n_residual_layers": 3,
                "num_D": 3,
                "ndf": 16,
                "n_layers": 4,
                "downsampling_factor": 4
            },
            "bigvgan": {
                "resblock": "1",
                "activation": "snakebeta",
                "snake_logscale": true,
                "upsample_rates": [8, 8, 2, 2, 2],
                "upsample_kernel_sizes": [16, 16, 4, 4, 4],
                "upsample_initial_channel": 512,
                "resblock_kernel_sizes": [3, 7, 11],
                "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
            },
            "nsfhifigan": {
                "resblock": "1",
                "harmonic_num": 8,
                "upsample_rates": [8, 8, 2, 2, 2],
                "upsample_kernel_sizes": [16, 16, 4, 4, 4],
                "upsample_initial_channel": 768,
                "resblock_kernel_sizes": [3, 7, 11],
                "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
            },
            "apnet": {
                "ASP_channel": 512,
                "ASP_resblock_kernel_sizes": [3, 7, 11],
                "ASP_resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                "ASP_input_conv_kernel_size": 7,
                "ASP_output_conv_kernel_size": 7,

                "PSP_channel": 512,
                "PSP_resblock_kernel_sizes": [3, 7, 11],
                "PSP_resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                "PSP_input_conv_kernel_size": 7,
                "PSP_output_R_conv_kernel_size": 7,
                "PSP_output_I_conv_kernel_size": 7
            }
        }
    },
    "train": {
        "fp16_run": true,
        "learning_rate": 2e-4,
        "betas": [0.8, 0.99],
        "eps": 1e-9,
        "batch_size": 16,
        "lr_decay": 0.999875,
        // "segment_size": 8192,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "AdamW": {
            "betas": [0.8, 0.99],
            "eps": 1e-9
        }
    }
}
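Note a consistency constraint hidden in the numbers above: for the hifigan generator, the product of upsample_rates is 8 × 8 × 2 × 2 × 2 = 512, which matches the preprocessing hop_size of 512, since the vocoder must expand one spectrogram frame into exactly one hop of waveform samples. A minimal sanity-check sketch (the load_jsonc helper and the config path are illustrative; it strips the // comments these configs use before parsing, assuming no "//" occurs inside string values):

import json
import re
from functools import reduce


def load_jsonc(path):
    # Strip "//" line comments, then parse as plain JSON.
    with open(path) as f:
        text = re.sub(r"//.*", "", f.read())
    return json.loads(text)


cfg = load_jsonc("config/vitssvc.json")  # illustrative path
rates = cfg["model"]["generator_config"]["hifigan"]["upsample_rates"]
assert reduce(lambda a, b: a * b, rates) == cfg["preprocess"]["hop_size"]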
config/vocoder.json
ADDED
@@ -0,0 +1,84 @@
{
    "base_config": "config/base.json",
    "dataset": [
        "LJSpeech",
        "LibriTTS",
        "opencpop",
        "m4singer",
        "svcc",
        "svcceval",
        "pjs",
        "opensinger",
        "popbutfy",
        "nus48e",
        "popcs",
        "kising",
        "csd",
        "opera",
        "vctk",
        "lijian",
        "cdmusiceval"
    ],
    "task_type": "vocoder",
    "preprocess": {
        // acoustic features
        "extract_mel": true,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_audio": true,
        "extract_label": false,
        "extract_one_hot": false,
        "extract_amplitude_phase": false,
        "pitch_extractor": "parselmouth",
        // Settings for data preprocessing
        "n_mel": 100,
        "win_size": 1024,
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024,
        "fmin": 0,
        "fmax": 12000,
        "f0_min": 50,
        "f0_max": 1100,
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_mu_law": false,
        "bits": 8,
        "cut_mel_frame": 32,
        // Directory names of processed data or extracted features
        "spk2id": "singers.json",
        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,
        "use_label": false,
        "use_one_hot": false,
        "train_file": "train.json",
        "valid_file": "test.json"
    },
    "train": {
        "random_seed": 114514,
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": 1000000,
        "save_checkpoint_stride": [20],
        "run_eval": [true],
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        "dataloader": {
            "num_worker": 4,
            "pin_memory": true
        },
        "tracker": ["tensorboard"]
    }
}
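Both configs above point to config/base.json through the base_config key, with child values overriding the base. Amphion's actual loader may differ; as a sketch under that assumption, inheritance can be resolved recursively (reusing the load_jsonc helper sketched after config/vitssvc.json):

def resolve_config(path):
    cfg = load_jsonc(path)
    base_path = cfg.pop("base_config", None)
    if base_path is None:
        return cfg
    merged = resolve_config(base_path)  # resolve parent values first

    def deep_update(dst, src):
        # Child keys override parent keys; nested dicts merge recursively.
        for key, value in src.items():
            if isinstance(value, dict) and isinstance(dst.get(key), dict):
                deep_update(dst[key], value)
            else:
                dst[key] = value
        return dst

    return deep_update(merged, cfg)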
evaluation/__init__.py
ADDED
File without changes
evaluation/features/__init__.py
ADDED
File without changes
evaluation/features/long_term_average_spectrum.py
ADDED
@@ -0,0 +1,19 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import librosa
from scipy import signal


def extract_ltas(audio, fs=None, n_fft=1024, hop_length=256):
    """Extract the Long-Term Average Spectrum (LTAS) for a given audio file."""
    if fs is not None:
        y, _ = librosa.load(audio, sr=fs)
    else:
        y, fs = librosa.load(audio)
    # Welch's method: average periodograms over overlapping segments
    frequency, density = signal.welch(
        x=y, fs=fs, window="hann", nperseg=hop_length, nfft=n_fft
    )
    return frequency, density
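A usage sketch for the extractor above (the wav path is illustrative): extract_ltas returns the frequency axis and the Welch power spectral density averaged over the whole utterance, so the LTAS of two systems can be compared bin by bin.

frequency, density = extract_ltas("sample.wav", fs=24000)  # illustrative path
# density[i] is the average power at frequency[i] Hz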
evaluation/features/signal_to_noise_ratio.py
ADDED
@@ -0,0 +1,133 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import scipy.signal as sig
import copy
import librosa


def bandpower(ps, mode="time"):
    """
    Estimate bandpower, see https://de.mathworks.com/help/signal/ref/bandpower.html
    """
    if mode == "time":
        x = ps
        l2norm = np.linalg.norm(x) ** 2.0 / len(x)
        return l2norm
    elif mode == "psd":
        return sum(ps)


def getIndizesAroundPeak(arr, peakIndex, searchWidth=1000):
    """Collect the bins on both sides of a peak until the magnitude rises again."""
    peakBins = []
    magMax = arr[peakIndex]
    curVal = magMax
    for i in range(searchWidth):
        newBin = peakIndex + i
        if newBin >= len(arr):
            break
        newVal = arr[newBin]
        if newVal > curVal:
            break
        else:
            peakBins.append(int(newBin))
            curVal = newVal
    curVal = magMax
    for i in range(searchWidth):
        newBin = peakIndex - i
        if newBin < 0:
            break
        newVal = arr[newBin]
        if newVal > curVal:
            break
        else:
            peakBins.append(int(newBin))
            curVal = newVal
    return np.array(list(set(peakBins)))


def freqToBin(fAxis, Freq):
    return np.argmin(abs(fAxis - Freq))


def getPeakInArea(psd, faxis, estimation, searchWidthHz=10):
    """
    Return the bin and frequency of the maximum in an area.
    """
    binLow = freqToBin(faxis, estimation - searchWidthHz)
    binHi = freqToBin(faxis, estimation + searchWidthHz)
    peakbin = binLow + np.argmax(psd[binLow : binHi + 1])
    return peakbin, faxis[peakbin]


def getHarmonics(fund, sr, nHarmonics=6, aliased=False):
    harmonicMultipliers = np.arange(2, nHarmonics + 2)
    harmonicFs = fund * harmonicMultipliers
    if not aliased:
        # Drop harmonics above the Nyquist frequency
        harmonicFs[harmonicFs > sr / 2] = -1
        harmonicFs = np.delete(harmonicFs, harmonicFs == -1)
    else:
        # Fold aliased harmonics back into [0, sr / 2]
        nyqZone = np.floor(harmonicFs / (sr / 2))
        oddEvenNyq = nyqZone % 2
        harmonicFs = np.mod(harmonicFs, sr / 2)
        harmonicFs[oddEvenNyq == 1] = (sr / 2) - harmonicFs[oddEvenNyq == 1]
    return harmonicFs


def extract_snr(audio, sr=None):
    """Extract the Signal-to-Noise Ratio (SNR) for a given audio file."""
    if sr is not None:
        audio, _ = librosa.load(audio, sr=sr)
    else:
        audio, sr = librosa.load(audio, sr=sr)
    faxis, ps = sig.periodogram(
        audio, fs=sr, window=("kaiser", 38)
    )  # get periodogram, parametrized like in MATLAB
    fundBin = np.argmax(
        ps
    )  # estimate fundamental at maximum amplitude, get the bin number
    fundIndizes = getIndizesAroundPeak(
        ps, fundBin
    )  # get bin numbers around fundamental peak
    fundFrequency = faxis[fundBin]  # frequency of fundamental

    nHarmonics = 18
    harmonicFs = getHarmonics(
        fundFrequency, sr, nHarmonics=nHarmonics, aliased=True
    )  # get harmonic frequencies

    harmonicBorders = np.zeros([2, nHarmonics], dtype=np.int16).T
    fullHarmonicBins = np.array([], dtype=np.int16)
    fullHarmonicBinList = []
    harmPeakFreqs = []
    harmPeaks = []
    for i, harmonic in enumerate(harmonicFs):
        searcharea = 0.1 * fundFrequency
        estimation = harmonic

        binNum, freq = getPeakInArea(ps, faxis, estimation, searcharea)
        harmPeakFreqs.append(freq)
        harmPeaks.append(ps[binNum])
        allBins = getIndizesAroundPeak(ps, binNum, searchWidth=1000)
        fullHarmonicBins = np.append(fullHarmonicBins, allBins)
        fullHarmonicBinList.append(allBins)
        harmonicBorders[i, :] = [allBins[0], allBins[-1]]

    fundIndizes.sort()
    pFund = bandpower(ps[fundIndizes[0] : fundIndizes[-1]])  # get power of fundamental

    # Replace fundamental and harmonic bins with the median noise floor
    noisePrepared = copy.copy(ps)
    noisePrepared[fundIndizes] = 0
    noisePrepared[fullHarmonicBins] = 0
    noiseMean = np.median(noisePrepared[noisePrepared != 0])
    noisePrepared[fundIndizes] = noiseMean
    noisePrepared[fullHarmonicBins] = noiseMean

    noisePower = bandpower(noisePrepared)

    r = 10 * np.log10(pFund / noisePower)

    return r, 10 * np.log10(noisePower)
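A usage sketch for extract_snr (path illustrative). The function estimates the fundamental from the periodogram peak, masks out the fundamental and its harmonics, treats the remainder as the noise floor, and returns the SNR in dB together with the noise power in dB:

snr_db, noise_power_db = extract_snr("sample.wav", sr=24000)  # illustrative path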
evaluation/features/singing_power_ratio.py
ADDED
@@ -0,0 +1,108 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import librosa

from utils.util import JsonHParams
from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median
from utils.mel import extract_mel_features


def extract_spr(
    audio,
    fs=None,
    hop_length=256,
    win_length=1024,
    n_fft=1024,
    n_mels=128,
    f0_min=37,
    f0_max=1000,
    pitch_bin=256,
    pitch_max=1100.0,
    pitch_min=50.0,
):
    """Compute Singing Power Ratio (SPR) from a given audio.
    audio: path to the audio.
    fs: sampling rate.
    hop_length: hop length.
    win_length: window length.
    n_mels: number of mel filters.
    f0_min: lower limit for f0.
    f0_max: upper limit for f0.
    pitch_bin: number of bins for f0 quantization.
    pitch_max: upper limit for f0 quantization.
    pitch_min: lower limit for f0 quantization.
    """
    # Load audio
    if fs is not None:
        audio, _ = librosa.load(audio, sr=fs)
    else:
        audio, fs = librosa.load(audio)
    audio = torch.from_numpy(audio)

    # Initialize config
    cfg = JsonHParams()
    cfg.sample_rate = fs
    cfg.hop_size = hop_length
    cfg.win_size = win_length
    cfg.n_fft = n_fft
    cfg.n_mel = n_mels
    cfg.f0_min = f0_min
    cfg.f0_max = f0_max
    cfg.pitch_bin = pitch_bin
    cfg.pitch_max = pitch_max
    cfg.pitch_min = pitch_min

    # Extract mel spectrogram of the 2-4 kHz band ...
    cfg.fmin = 2000
    cfg.fmax = 4000

    mel1 = extract_mel_features(
        y=audio.unsqueeze(0),
        cfg=cfg,
    ).squeeze(0)

    # ... and of the 0-2 kHz band
    cfg.fmin = 0
    cfg.fmax = 2000

    mel2 = extract_mel_features(
        y=audio.unsqueeze(0),
        cfg=cfg,
    ).squeeze(0)

    f0 = get_f0_features_using_parselmouth(
        audio,
        cfg,
    )[0]

    # Mel length alignment
    length = min(len(f0), mel1.shape[-1])
    f0 = f0[:length]
    mel1 = mel1[:, :length]
    mel2 = mel2[:, :length]

    # Compute SPR over voiced frames only
    res = []

    for i in range(mel1.shape[-1]):
        if f0[i] <= 1:
            continue

        chunk1 = mel1[:, i]
        chunk2 = mel2[:, i]

        max1 = max(chunk1.numpy())
        max2 = max(chunk2.numpy())

        tmp_res = max2 - max1

        res.append(tmp_res)

    if len(res) == 0:
        return False
    else:
        return sum(res) / len(res)
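In extract_spr above, mel1 covers the 2000-4000 Hz band and mel2 covers the 0-2000 Hz band, so each per-frame term is the low-band mel peak minus the high-band mel peak, averaged over voiced frames; False is returned when no voiced frame is found. A usage sketch (path illustrative):

spr = extract_spr("vocal.wav", fs=24000)  # illustrative path
if spr is not False:
    print(f"Singing Power Ratio: {spr:.2f}")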
evaluation/metrics/__init__.py
ADDED
File without changes
evaluation/metrics/energy/__init__.py
ADDED
File without changes
evaluation/metrics/energy/energy_pearson_coefficients.py
ADDED
@@ -0,0 +1,91 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import librosa
import torch

import numpy as np
from numpy import linalg as LA

from torchmetrics import PearsonCorrCoef


def extract_energy_pearson_coeffcients(
    audio_ref,
    audio_deg,
    fs=None,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    method="dtw",
    db_scale=True,
):
    """Compute Energy Pearson Coefficients between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    n_fft: fft size.
    hop_length: hop length.
    win_length: window length.
    method: "dtw" uses the dtw algorithm to align the lengths of the ground truth and predicted audio.
            "cut" cuts both audios to the length of the shorter one.
    db_scale: the ground truth and predicted energy are converted to db scale if True.
    """
    # Initialize method
    pearson = PearsonCorrCoef()

    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # STFT
    spec_ref = librosa.stft(
        y=audio_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length
    )
    spec_deg = librosa.stft(
        y=audio_deg, n_fft=n_fft, hop_length=hop_length, win_length=win_length
    )

    # Get magnitudes
    mag_ref = np.abs(spec_ref).T
    mag_deg = np.abs(spec_deg).T

    # Convert spectrogram to frame-level energy (L2 norm over frequency bins)
    energy_ref = LA.norm(mag_ref, axis=1)
    energy_deg = LA.norm(mag_deg, axis=1)

    # Convert to db scale
    if db_scale:
        energy_ref = 20 * np.log10(energy_ref)
        energy_deg = 20 * np.log10(energy_deg)

    # Audio length alignment
    if method == "cut":
        length = min(len(energy_ref), len(energy_deg))
        energy_ref = energy_ref[:length]
        energy_deg = energy_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(energy_ref, energy_deg, backtrack=True)
        energy_gt_new = []
        energy_pred_new = []
        for i in range(wp.shape[0]):
            gt_index = wp[i][0]
            pred_index = wp[i][1]
            energy_gt_new.append(energy_ref[gt_index])
            energy_pred_new.append(energy_deg[pred_index])
        energy_ref = np.array(energy_gt_new)
        energy_deg = np.array(energy_pred_new)
        assert len(energy_ref) == len(energy_deg)

    # Convert to tensor
    energy_ref = torch.from_numpy(energy_ref)
    energy_deg = torch.from_numpy(energy_deg)

    return pearson(energy_ref, energy_deg).numpy().tolist()
evaluation/metrics/energy/energy_rmse.py
ADDED
@@ -0,0 +1,86 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import librosa
import torch

import numpy as np
from numpy import linalg as LA


def extract_energy_rmse(
    audio_ref,
    audio_deg,
    fs=None,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    method="dtw",
    db_scale=True,
):
    """Compute Energy Root Mean Square Error (RMSE) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    n_fft: fft size.
    hop_length: hop length.
    win_length: window length.
    method: "dtw" uses the dtw algorithm to align the lengths of the ground truth and predicted audio.
            "cut" cuts both audios to the length of the shorter one.
    db_scale: the ground truth and predicted energy are converted to db scale if True.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # STFT
    spec_ref = librosa.stft(
        y=audio_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length
    )
    spec_deg = librosa.stft(
        y=audio_deg, n_fft=n_fft, hop_length=hop_length, win_length=win_length
    )

    # Get magnitudes
    mag_ref = np.abs(spec_ref).T
    mag_deg = np.abs(spec_deg).T

    # Convert spectrogram to frame-level energy (L2 norm over frequency bins)
    energy_ref = LA.norm(mag_ref, axis=1)
    energy_deg = LA.norm(mag_deg, axis=1)

    # Convert to db scale
    if db_scale:
        energy_ref = 20 * np.log10(energy_ref)
        energy_deg = 20 * np.log10(energy_deg)

    # Audio length alignment
    if method == "cut":
        length = min(len(energy_ref), len(energy_deg))
        energy_ref = energy_ref[:length]
        energy_deg = energy_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(energy_ref, energy_deg, backtrack=True)
        energy_gt_new = []
        energy_pred_new = []
        for i in range(wp.shape[0]):
            gt_index = wp[i][0]
            pred_index = wp[i][1]
            energy_gt_new.append(energy_ref[gt_index])
            energy_pred_new.append(energy_deg[pred_index])
        energy_ref = np.array(energy_gt_new)
        energy_deg = np.array(energy_pred_new)
        assert len(energy_ref) == len(energy_deg)

    # Compute RMSE
    energy_mse = np.square(np.subtract(energy_ref, energy_deg)).mean()
    energy_rmse = math.sqrt(energy_mse)

    return energy_rmse
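The two energy metrics above share one interface: two audio paths, an optional resampling rate, and dtw or cut length alignment. A usage sketch with illustrative paths:

rmse = extract_energy_rmse("gt.wav", "pred.wav", fs=24000, method="dtw")
corr = extract_energy_pearson_coeffcients("gt.wav", "pred.wav", fs=24000, method="dtw")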
evaluation/metrics/f0/__init__.py
ADDED
File without changes
evaluation/metrics/f0/f0_pearson_coefficients.py
ADDED
@@ -0,0 +1,111 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import librosa

import numpy as np

from torchmetrics import PearsonCorrCoef

from utils.util import JsonHParams
from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median


def extract_fpc(
    audio_ref,
    audio_deg,
    fs=None,
    hop_length=256,
    f0_min=50,
    f0_max=1100,
    pitch_bin=256,
    pitch_min=50,
    pitch_max=1100,
    need_mean=True,
    method="dtw",
):
    """Compute the F0 Pearson Coefficient (FPC) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    hop_length: hop length.
    f0_min: lower limit for f0.
    f0_max: upper limit for f0.
    pitch_bin: number of bins for f0 quantization.
    pitch_max: upper limit for f0 quantization.
    pitch_min: lower limit for f0 quantization.
    need_mean: subtract the mean value from f0 if True.
    method: "dtw" uses the dtw algorithm to align the lengths of the ground truth and predicted audio.
            "cut" cuts both audios to the length of the shorter one.
    """
    # Initialize method
    pearson = PearsonCorrCoef()

    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Initialize config
    cfg = JsonHParams()
    cfg.sample_rate = fs
    cfg.hop_size = hop_length
    cfg.f0_min = f0_min
    cfg.f0_max = f0_max
    cfg.pitch_bin = pitch_bin
    cfg.pitch_max = pitch_max
    cfg.pitch_min = pitch_min

    # Compute f0
    f0_ref = get_f0_features_using_parselmouth(
        audio_ref,
        cfg,
    )[0]

    f0_deg = get_f0_features_using_parselmouth(
        audio_deg,
        cfg,
    )[0]

    # Subtract mean value from f0
    if need_mean:
        f0_ref = torch.from_numpy(f0_ref)
        f0_deg = torch.from_numpy(f0_deg)

        f0_ref = get_pitch_sub_median(f0_ref).numpy()
        f0_deg = get_pitch_sub_median(f0_deg).numpy()

    # Avoid silence (treated as perfect correlation)
    min_length = min(len(f0_ref), len(f0_deg))
    if min_length <= 1:
        return 1

    # F0 length alignment
    if method == "cut":
        length = min(len(f0_ref), len(f0_deg))
        f0_ref = f0_ref[:length]
        f0_deg = f0_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True)
        f0_gt_new = []
        f0_pred_new = []
        for i in range(wp.shape[0]):
            gt_index = wp[i][0]
            pred_index = wp[i][1]
            f0_gt_new.append(f0_ref[gt_index])
            f0_pred_new.append(f0_deg[pred_index])
        f0_ref = np.array(f0_gt_new)
        f0_deg = np.array(f0_pred_new)
        assert len(f0_ref) == len(f0_deg)

    # Convert to tensor
    f0_ref = torch.from_numpy(f0_ref)
    f0_deg = torch.from_numpy(f0_deg)

    return pearson(f0_ref, f0_deg).numpy().tolist()
evaluation/metrics/f0/f0_periodicity_rmse.py
ADDED
@@ -0,0 +1,112 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torchcrepe
import math
import librosa
import torch

import numpy as np


def extract_f0_periodicity_rmse(
    audio_ref,
    audio_deg,
    fs=None,
    hop_length=256,
    method="dtw",
):
    """Compute the f0 periodicity Root Mean Square Error (RMSE) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    hop_length: hop length.
    method: "dtw" uses the dtw algorithm to align the lengths of the ground truth and predicted audio.
            "cut" cuts both audios to the length of the shorter one.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Convert to torch
    audio_ref = torch.from_numpy(audio_ref).unsqueeze(0)
    audio_deg = torch.from_numpy(audio_deg).unsqueeze(0)

    # Get periodicity with torchcrepe (inference runs on cuda:0)
    pitch_ref, periodicity_ref = torchcrepe.predict(
        audio_ref,
        sample_rate=fs,
        hop_length=hop_length,
        fmin=0,
        fmax=1500,
        model="full",
        return_periodicity=True,
        device="cuda:0",
    )
    pitch_deg, periodicity_deg = torchcrepe.predict(
        audio_deg,
        sample_rate=fs,
        hop_length=hop_length,
        fmin=0,
        fmax=1500,
        model="full",
        return_periodicity=True,
        device="cuda:0",
    )

    # Cut silence
    periodicity_ref = (
        torchcrepe.threshold.Silence()(
            periodicity_ref,
            audio_ref,
            fs,
            hop_length=hop_length,
        )
        .squeeze(0)
        .numpy()
    )
    periodicity_deg = (
        torchcrepe.threshold.Silence()(
            periodicity_deg,
            audio_deg,
            fs,
            hop_length=hop_length,
        )
        .squeeze(0)
        .numpy()
    )

    # Avoid silence audio
    min_length = min(len(periodicity_ref), len(periodicity_deg))
    if min_length <= 1:
        return 0

    # Periodicity length alignment
    if method == "cut":
        length = min(len(periodicity_ref), len(periodicity_deg))
        periodicity_ref = periodicity_ref[:length]
        periodicity_deg = periodicity_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(periodicity_ref, periodicity_deg, backtrack=True)
        periodicity_ref_new = []
        periodicity_deg_new = []
        for i in range(wp.shape[0]):
            ref_index = wp[i][0]
            deg_index = wp[i][1]
            periodicity_ref_new.append(periodicity_ref[ref_index])
            periodicity_deg_new.append(periodicity_deg[deg_index])
        periodicity_ref = np.array(periodicity_ref_new)
        periodicity_deg = np.array(periodicity_deg_new)
        assert len(periodicity_ref) == len(periodicity_deg)

    # Compute RMSE
    periodicity_mse = np.square(np.subtract(periodicity_ref, periodicity_deg)).mean()
    periodicity_rmse = math.sqrt(periodicity_mse)

    return periodicity_rmse
evaluation/metrics/f0/f0_rmse.py
ADDED
@@ -0,0 +1,110 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import librosa
import torch

import numpy as np

from utils.util import JsonHParams
from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median


ZERO = 1e-8


def extract_f0rmse(
    audio_ref,
    audio_deg,
    fs=None,
    hop_length=256,
    f0_min=37,
    f0_max=1000,
    pitch_bin=256,
    pitch_max=1100.0,
    pitch_min=50.0,
    need_mean=True,
    method="dtw",
):
    """Compute F0 Root Mean Square Error (RMSE) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    hop_length: hop length.
    f0_min: lower limit for f0.
    f0_max: upper limit for f0.
    pitch_bin: number of bins for f0 quantization.
    pitch_max: upper limit for f0 quantization.
    pitch_min: lower limit for f0 quantization.
    need_mean: subtract the mean value from f0 if True.
    method: "dtw" uses the dtw algorithm to align the lengths of the ground truth and predicted audio.
            "cut" cuts both audios to the length of the shorter one.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Initialize config for f0 extraction
    cfg = JsonHParams()
    cfg.sample_rate = fs
    cfg.hop_size = hop_length
    cfg.f0_min = f0_min
    cfg.f0_max = f0_max
    cfg.pitch_bin = pitch_bin
    cfg.pitch_max = pitch_max
    cfg.pitch_min = pitch_min

    # Extract f0
    f0_ref = get_f0_features_using_parselmouth(
        audio_ref,
        cfg,
    )[0]

    f0_deg = get_f0_features_using_parselmouth(
        audio_deg,
        cfg,
    )[0]

    # Subtract mean value from f0
    if need_mean:
        f0_ref = torch.from_numpy(f0_ref)
        f0_deg = torch.from_numpy(f0_deg)

        f0_ref = get_pitch_sub_median(f0_ref).numpy()
        f0_deg = get_pitch_sub_median(f0_deg).numpy()

    # Avoid silence
    min_length = min(len(f0_ref), len(f0_deg))
    if min_length <= 1:
        return 0

    # F0 length alignment
    if method == "cut":
        length = min(len(f0_ref), len(f0_deg))
        f0_ref = f0_ref[:length]
        f0_deg = f0_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True)
        f0_gt_new = []
        f0_pred_new = []
        for i in range(wp.shape[0]):
            gt_index = wp[i][0]
            pred_index = wp[i][1]
            f0_gt_new.append(f0_ref[gt_index])
            f0_pred_new.append(f0_deg[pred_index])
        f0_ref = np.array(f0_gt_new)
        f0_deg = np.array(f0_pred_new)
        assert len(f0_ref) == len(f0_deg)

    # Compute RMSE
    f0_mse = np.square(np.subtract(f0_ref, f0_deg)).mean()
    f0_rmse = math.sqrt(f0_mse)

    return f0_rmse
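The f0 metrics follow the same two-path interface; a usage sketch with illustrative paths:

rmse = extract_f0rmse("gt.wav", "pred.wav", fs=24000, method="dtw")
fpc = extract_fpc("gt.wav", "pred.wav", fs=24000, method="dtw")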
evaluation/metrics/f0/v_uv_f1.py
ADDED
@@ -0,0 +1,110 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import librosa
import torch

import numpy as np

from utils.util import JsonHParams
from utils.f0 import get_f0_features_using_parselmouth


ZERO = 1e-8


def extract_f1_v_uv(
    audio_ref,
    audio_deg,
    fs=None,
    hop_length=256,
    f0_min=37,
    f0_max=1000,
    pitch_bin=256,
    pitch_max=1100.0,
    pitch_min=50.0,
    method="dtw",
):
    """Compute the F1 score of voiced/unvoiced accuracy between the predicted and the ground truth audio.
    Returns the raw (true positive, false positive, false negative) counts.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    hop_length: hop length.
    f0_min: lower limit for f0.
    f0_max: upper limit for f0.
    pitch_bin: number of bins for f0 quantization.
    pitch_max: upper limit for f0 quantization.
    pitch_min: lower limit for f0 quantization.
    method: "dtw" uses the dtw algorithm to align the lengths of the ground truth and predicted audio.
            "cut" cuts both audios to the length of the shorter one.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Initialize config
    cfg = JsonHParams()
    cfg.sample_rate = fs
    cfg.hop_size = hop_length
    cfg.f0_min = f0_min
    cfg.f0_max = f0_max
    cfg.pitch_bin = pitch_bin
    cfg.pitch_max = pitch_max
    cfg.pitch_min = pitch_min

    # Compute f0
    f0_ref = get_f0_features_using_parselmouth(
        audio_ref,
        cfg,
    )[0]

    f0_deg = get_f0_features_using_parselmouth(
        audio_deg,
        cfg,
    )[0]

    # Avoid silence
    min_length = min(len(f0_ref), len(f0_deg))
    if min_length <= 1:
        return 0, 0, 0

    # F0 length alignment
    if method == "cut":
        length = min(len(f0_ref), len(f0_deg))
        f0_ref = f0_ref[:length]
        f0_deg = f0_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True)
        f0_gt_new = []
        f0_pred_new = []
        for i in range(wp.shape[0]):
            gt_index = wp[i][0]
            pred_index = wp[i][1]
            f0_gt_new.append(f0_ref[gt_index])
            f0_pred_new.append(f0_deg[pred_index])
        f0_ref = np.array(f0_gt_new)
        f0_deg = np.array(f0_pred_new)
        assert len(f0_ref) == len(f0_deg)

    # Get voiced/unvoiced parts (a frame is voiced when its f0 is non-zero)
    ref_voiced = torch.Tensor([f0_ref != 0]).bool()
    deg_voiced = torch.Tensor([f0_deg != 0]).bool()

    # Compute TP, FP, FN
    true_positives = (ref_voiced & deg_voiced).sum()
    false_positives = (~ref_voiced & deg_voiced).sum()
    false_negatives = (ref_voiced & ~deg_voiced).sum()

    return (
        true_positives.numpy().tolist(),
        false_positives.numpy().tolist(),
        false_negatives.numpy().tolist(),
    )
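Note that extract_f1_v_uv returns raw (TP, FP, FN) counts rather than the F1 score itself, which lets counts be accumulated over a whole test set before the final score is computed. A minimal aggregation sketch, where pairs is an illustrative list of (ground truth, prediction) paths and ZERO is the module's 1e-8 constant:

tp, fp, fn = 0, 0, 0
for gt_path, pred_path in pairs:
    _tp, _fp, _fn = extract_f1_v_uv(gt_path, pred_path, fs=24000)
    tp, fp, fn = tp + _tp, fp + _fp, fn + _fn

precision = tp / (tp + fp + ZERO)
recall = tp / (tp + fn + ZERO)
f1 = 2 * precision * recall / (precision + recall + ZERO)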
evaluation/metrics/intelligibility/__init__.py
ADDED
File without changes
evaluation/metrics/intelligibility/character_error_rate.py
ADDED
@@ -0,0 +1,81 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import whisper

from torchmetrics import CharErrorRate


def extract_cer(
    content_gt=None,
    audio_ref=None,
    audio_deg=None,
    fs=None,
    language="chinese",
    remove_space=True,
    remove_punctuation=True,
    mode="gt_audio",
):
    """Compute Character Error Rate (CER) between the predicted and the ground truth audio.
    content_gt: the ground truth content.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    mode: "gt_content" computes the CER between the predicted content obtained from the whisper model and the ground truth content.
          Both content_gt and audio_deg are needed.
          "gt_audio" computes the CER between the ground truth and predicted contents, both obtained from the whisper model.
          Both audio_ref and audio_deg are needed.
    """
    # Get ground truth content
    if mode == "gt_content":
        assert content_gt is not None
        if language == "chinese":
            prompt = "以下是普通话的句子"
            model = whisper.load_model("large").cuda()
            result_deg = model.transcribe(
                audio_deg, language="zh", verbose=True, initial_prompt=prompt
            )
        elif language == "english":
            model = whisper.load_model("large").cuda()
            result_deg = model.transcribe(audio_deg, language="en", verbose=True)
    elif mode == "gt_audio":
        assert audio_ref is not None
        if language == "chinese":
            prompt = "以下是普通话的句子"
            model = whisper.load_model("large").cuda()
            result_ref = model.transcribe(
                audio_ref, language="zh", verbose=True, initial_prompt=prompt
            )
            result_deg = model.transcribe(
                audio_deg, language="zh", verbose=True, initial_prompt=prompt
            )
        elif language == "english":
            model = whisper.load_model("large").cuda()
            # Transcribe the reference audio (not audio_deg) for the ground truth
            result_ref = model.transcribe(audio_ref, language="en", verbose=True)
            result_deg = model.transcribe(audio_deg, language="en", verbose=True)
        content_gt = result_ref["text"]
    if remove_space:
        content_gt = content_gt.replace(" ", "")
    if remove_punctuation:
        content_gt = content_gt.replace(".", "")
        content_gt = content_gt.replace("'", "")
        content_gt = content_gt.replace("-", "")
        content_gt = content_gt.replace(",", "")
        content_gt = content_gt.replace("!", "")
    content_gt = content_gt.lower()

    # Get predicted content
    content_pred = result_deg["text"]
    if remove_space:
        content_pred = content_pred.replace(" ", "")
    if remove_punctuation:
        content_pred = content_pred.replace(".", "")
        content_pred = content_pred.replace("'", "")
        content_pred = content_pred.replace("-", "")
        content_pred = content_pred.replace(",", "")
        content_pred = content_pred.replace("!", "")
    content_pred = content_pred.lower()
    cer = CharErrorRate()

    return cer(content_pred, content_gt).numpy().tolist()
evaluation/metrics/intelligibility/word_error_rate.py
ADDED
@@ -0,0 +1,81 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import whisper

from torchmetrics import WordErrorRate


def extract_wer(
    content_gt=None,
    audio_ref=None,
    audio_deg=None,
    fs=None,
    language="chinese",
    remove_space=True,
    remove_punctuation=True,
    mode="gt_audio",
):
    """Compute Word Error Rate (WER) between the predicted and the ground truth audio.
    content_gt: the ground truth content.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    mode: "gt_content" computes the WER between the predicted content obtained from the whisper model and the ground truth content.
          Both content_gt and audio_deg are needed.
          "gt_audio" computes the WER between the ground truth and predicted contents, both obtained from the whisper model.
          Both audio_ref and audio_deg are needed.
    """
    # Get ground truth content
    if mode == "gt_content":
        assert content_gt is not None
        if language == "chinese":
            prompt = "以下是普通话的句子"
            model = whisper.load_model("large").cuda()
            result_deg = model.transcribe(
                audio_deg, language="zh", verbose=True, initial_prompt=prompt
            )
        elif language == "english":
            model = whisper.load_model("large").cuda()
            result_deg = model.transcribe(audio_deg, language="en", verbose=True)
    elif mode == "gt_audio":
        assert audio_ref is not None
        if language == "chinese":
            prompt = "以下是普通话的句子"
            model = whisper.load_model("large").cuda()
            result_ref = model.transcribe(
                audio_ref, language="zh", verbose=True, initial_prompt=prompt
            )
            result_deg = model.transcribe(
                audio_deg, language="zh", verbose=True, initial_prompt=prompt
            )
        elif language == "english":
            model = whisper.load_model("large").cuda()
            # Transcribe the reference audio (not audio_deg) for the ground truth
            result_ref = model.transcribe(audio_ref, language="en", verbose=True)
            result_deg = model.transcribe(audio_deg, language="en", verbose=True)
        content_gt = result_ref["text"]
    if remove_space:
        content_gt = content_gt.replace(" ", "")
    if remove_punctuation:
        content_gt = content_gt.replace(".", "")
        content_gt = content_gt.replace("'", "")
        content_gt = content_gt.replace("-", "")
        content_gt = content_gt.replace(",", "")
        content_gt = content_gt.replace("!", "")
    content_gt = content_gt.lower()

    # Get predicted content
    content_pred = result_deg["text"]
    if remove_space:
        content_pred = content_pred.replace(" ", "")
    if remove_punctuation:
        content_pred = content_pred.replace(".", "")
        content_pred = content_pred.replace("'", "")
        content_pred = content_pred.replace("-", "")
        content_pred = content_pred.replace(",", "")
        content_pred = content_pred.replace("!", "")
    content_pred = content_pred.lower()
    wer = WordErrorRate()

    return wer(content_pred, content_gt).numpy().tolist()
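Both intelligibility metrics can score against a known transcript or against a reference recording; a usage sketch with illustrative paths:

# Compare the Whisper transcript of the prediction against a known transcript
wer = extract_wer(
    content_gt="hello world", audio_deg="pred.wav",
    language="english", mode="gt_content",
)
# Or transcribe both recordings and compare the two transcripts
cer = extract_cer(
    audio_ref="gt.wav", audio_deg="pred.wav",
    language="english", mode="gt_audio",
)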
evaluation/metrics/similarity/__init__.py
ADDED
File without changes
evaluation/metrics/similarity/models/RawNetBasicBlock.py
ADDED
@@ -0,0 +1,146 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class PreEmphasis(torch.nn.Module):
    def __init__(self, coef: float = 0.97) -> None:
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation, so the filter is flipped.
        self.register_buffer(
            "flipped_filter",
            torch.FloatTensor([-self.coef, 1.0]).unsqueeze(0).unsqueeze(0),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        assert (
            len(input.size()) == 2
        ), "The number of dimensions of input tensor must be 2!"
        # reflect padding to match lengths of in/out
        input = input.unsqueeze(1)
        input = F.pad(input, (1, 0), "reflect")
        return F.conv1d(input, self.flipped_filter)


class AFMS(nn.Module):
    """
    Alpha-Feature Map Scaling, added to the output of each residual block [1, 2].

    Reference:
    [1] RawNet2: https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf
    [2] AFMS: https://www.koreascience.or.kr/article/JAKO202029757857763.page
    """

    def __init__(self, nb_dim: int) -> None:
        super().__init__()
        self.alpha = nn.Parameter(torch.ones((nb_dim, 1)))
        self.fc = nn.Linear(nb_dim, nb_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        y = F.adaptive_avg_pool1d(x, 1).view(x.size(0), -1)
        y = self.sig(self.fc(y)).view(x.size(0), x.size(1), -1)

        x = x + self.alpha
        x = x * y
        return x


class Bottle2neck(nn.Module):
    def __init__(
        self,
        inplanes,
        planes,
        kernel_size=None,
        dilation=None,
        scale=4,
        pool=False,
    ):
        super().__init__()

        width = int(math.floor(planes / scale))

        self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(width * scale)

        self.nums = scale - 1

        convs = []
        bns = []

        num_pad = math.floor(kernel_size / 2) * dilation

        for i in range(self.nums):
            convs.append(
                nn.Conv1d(
                    width,
                    width,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    padding=num_pad,
                )
            )
            bns.append(nn.BatchNorm1d(width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)

        self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1)
        self.bn3 = nn.BatchNorm1d(planes)

        self.relu = nn.ReLU()

        self.width = width

        self.mp = nn.MaxPool1d(pool) if pool else False
        self.afms = AFMS(planes)

        if inplanes != planes:  # if change in number of filters
            self.residual = nn.Sequential(
                nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False)
            )
        else:
            self.residual = nn.Identity()

    def forward(self, x):
        residual = self.residual(x)

        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        # Res2Net-style multi-scale processing over channel splits
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(sp)
            sp = self.bns[i](sp)
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = torch.cat((out, spx[self.nums]), 1)

        out = self.conv3(out)
        out = self.relu(out)
        out = self.bn3(out)

        out += residual
        if self.mp:
            out = self.mp(out)
        out = self.afms(out)

        return out
evaluation/metrics/similarity/models/RawNetModel.py
ADDED
@@ -0,0 +1,142 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# -*- encoding: utf-8 -*-

import torch
import torch.nn as nn
from asteroid_filterbanks import Encoder, ParamSincFB

from .RawNetBasicBlock import Bottle2neck, PreEmphasis


class RawNet3(nn.Module):
    def __init__(self, block, model_scale, context, summed, C=1024, **kwargs):
        super().__init__()

        nOut = kwargs["nOut"]

        self.context = context
        self.encoder_type = kwargs["encoder_type"]
        self.log_sinc = kwargs["log_sinc"]
        self.norm_sinc = kwargs["norm_sinc"]
        self.out_bn = kwargs["out_bn"]
        self.summed = summed

        self.preprocess = nn.Sequential(
            PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True)
        )
        self.conv1 = Encoder(
            ParamSincFB(
                C // 4,
                251,
                stride=kwargs["sinc_stride"],
            )
        )
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(C // 4)

        self.layer1 = block(
            C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5
        )
        self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3)
        self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale)
        self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1)

        if self.context:
            attn_input = 1536 * 3
        else:
            attn_input = 1536
        print("self.encoder_type", self.encoder_type)
        if self.encoder_type == "ECA":
            attn_output = 1536
        elif self.encoder_type == "ASP":
            attn_output = 1
        else:
            raise ValueError("Undefined encoder")

        self.attention = nn.Sequential(
            nn.Conv1d(attn_input, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, attn_output, kernel_size=1),
            nn.Softmax(dim=2),
        )

        self.bn5 = nn.BatchNorm1d(3072)

        self.fc6 = nn.Linear(3072, nOut)
        self.bn6 = nn.BatchNorm1d(nOut)

        self.mp3 = nn.MaxPool1d(3)

    def forward(self, x):
        """
        :param x: input mini-batch (bs, samp)
        """

        with torch.cuda.amp.autocast(enabled=False):
            x = self.preprocess(x)
            x = torch.abs(self.conv1(x))
            if self.log_sinc:
                x = torch.log(x + 1e-6)
            if self.norm_sinc == "mean":
                x = x - torch.mean(x, dim=-1, keepdim=True)
            elif self.norm_sinc == "mean_std":
                m = torch.mean(x, dim=-1, keepdim=True)
                s = torch.std(x, dim=-1, keepdim=True)
                s[s < 0.001] = 0.001
                x = (x - m) / s

        if self.summed:
            x1 = self.layer1(x)
            x2 = self.layer2(x1)
            x3 = self.layer3(self.mp3(x1) + x2)
        else:
            x1 = self.layer1(x)
            x2 = self.layer2(x1)
            x3 = self.layer3(x2)

        x = self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1))
        x = self.relu(x)

        t = x.size()[-1]

        # Append global mean and std statistics for context-aware attention
        if self.context:
            global_x = torch.cat(
                (
                    x,
                    torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t),
                    torch.sqrt(
                        torch.var(x, dim=2, keepdim=True).clamp(min=1e-4, max=1e4)
                    ).repeat(1, 1, t),
                ),
                dim=1,
            )
        else:
            global_x = x

        w = self.attention(global_x)

        # Attentive statistics pooling: attention-weighted mean and std
        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt(
            (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)
        )

        x = torch.cat((mu, sg), 1)

        x = self.bn5(x)

        x = self.fc6(x)

        if self.out_bn:
            x = self.bn6(x)

        return x


def MainModel(**kwargs):
    model = RawNet3(Bottle2neck, model_scale=8, context=True, summed=True, **kwargs)
    return model
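A shape-check sketch for the model above, using the same keyword arguments that speaker_similarity.py (next file) passes when instantiating RawNet3; the input is a mini-batch of 3-second, 16 kHz waveforms and the output is one 256-dimensional embedding per clip:

import torch

model = MainModel(
    nOut=256, encoder_type="ECA", sinc_stride=10,
    log_sinc=True, norm_sinc="mean", out_bn=False,
)
model.eval()
wav = torch.randn(2, 48000)  # (batch, samples): two 3 s clips at 16 kHz
with torch.no_grad():
    emb = model(wav)
print(emb.shape)  # expected: torch.Size([2, 256])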
evaluation/metrics/similarity/models/__init__.py
ADDED
File without changes
evaluation/metrics/similarity/speaker_similarity.py
ADDED
@@ -0,0 +1,119 @@
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import os
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import soundfile as sf
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from tqdm import tqdm
|
13 |
+
import librosa
|
14 |
+
|
15 |
+
from .models.RawNetModel import RawNet3
|
16 |
+
from .models.RawNetBasicBlock import Bottle2neck
|
17 |
+
|
18 |
+
|
19 |
+
def extract_speaker_embd(
|
20 |
+
model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False
|
21 |
+
) -> np.ndarray:
|
22 |
+
audio, sample_rate = sf.read(fn)
|
23 |
+
if len(audio.shape) > 1:
|
24 |
+
raise ValueError(
|
25 |
+
f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}."
|
26 |
+
)
|
27 |
+
|
28 |
+
if sample_rate != 16000:
|
29 |
+
# resample to 16000kHz
|
30 |
+
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
|
31 |
+
# print("resample to 16000kHz!")
|
32 |
+
if len(audio) < n_samples: # RawNet3 was trained using utterances of 3 seconds
|
33 |
+
shortage = n_samples - len(audio) + 1
|
34 |
+
audio = np.pad(audio, (0, shortage), "wrap")
|
35 |
+
|
36 |
+
audios = []
|
37 |
+
startframe = np.linspace(0, len(audio) - n_samples, num=n_segments)
|
38 |
+
for asf in startframe:
|
39 |
+
audios.append(audio[int(asf) : int(asf) + n_samples])
|
40 |
+
|
41 |
+
audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32))
|
42 |
+
if gpu:
|
43 |
+
audios = audios.to("cuda")
|
44 |
+
with torch.no_grad():
|
45 |
+
output = model(audios)
|
46 |
+
|
47 |
+
return output
|
48 |
+
|
49 |
+
|
50 |
+
def extract_speaker_similarity(target_path, reference_path):
|
51 |
+
model = RawNet3(
|
52 |
+
Bottle2neck,
|
53 |
+
model_scale=8,
|
54 |
+
context=True,
|
55 |
+
summed=True,
|
56 |
+
encoder_type="ECA",
|
57 |
+
nOut=256,
|
58 |
+
out_bn=False,
|
59 |
+
sinc_stride=10,
|
60 |
+
log_sinc=True,
|
61 |
+
norm_sinc="mean",
|
62 |
+
grad_mult=1,
|
63 |
+
)
|
64 |
+
|
65 |
+
gpu = False
|
66 |
+
model.load_state_dict(
|
67 |
+
torch.load(
|
68 |
+
"pretrained/rawnet3/model.pt",
|
69 |
+
map_location=lambda storage, loc: storage,
|
70 |
+
)["model"]
|
71 |
+
)
|
72 |
+
model.eval()
|
73 |
+
print("RawNet3 initialised & weights loaded!")
|
74 |
+
|
75 |
+
if torch.cuda.is_available():
|
76 |
+
print("Cuda available, conducting inference on GPU")
|
77 |
+
model = model.to("cuda")
|
78 |
+
gpu = True
|
79 |
+
# for target_path, reference_path in zip(target_paths, ref_paths):
|
80 |
+
# print(f"Extracting embeddings for target singers...")
|
81 |
+
|
82 |
+
target_embeddings = []
|
83 |
+
for file in tqdm(os.listdir(target_path)):
|
84 |
+
output = extract_speaker_embd(
|
85 |
+
model,
|
86 |
+
fn=os.path.join(target_path, file),
|
87 |
+
n_samples=48000,
|
88 |
+
n_segments=10,
|
89 |
+
gpu=gpu,
|
90 |
+
).mean(0)
|
91 |
+
target_embeddings.append(output.detach().cpu().numpy())
|
92 |
+
target_embeddings = np.array(target_embeddings)
|
93 |
+
target_embedding = np.mean(target_embeddings, axis=0)
|
94 |
+
|
95 |
+
# print(f"Extracting embeddings for reference singer...")
|
96 |
+
|
97 |
+
reference_embeddings = []
|
98 |
+
for file in tqdm(os.listdir(reference_path)):
|
99 |
+
output = extract_speaker_embd(
|
100 |
+
model,
|
101 |
+
fn=os.path.join(reference_path, file),
|
102 |
+
n_samples=48000,
|
103 |
+
n_segments=10,
|
104 |
+
gpu=gpu,
|
105 |
+
).mean(0)
|
106 |
+
reference_embeddings.append(output.detach().cpu().numpy())
|
107 |
+
reference_embeddings = np.array(reference_embeddings)
|
108 |
+
|
109 |
+
# print("Calculating cosine similarity...")
|
110 |
+
|
111 |
+
cos_sim = F.cosine_similarity(
|
112 |
+
torch.from_numpy(np.mean(target_embeddings, axis=0)).unsqueeze(0),
|
113 |
+
torch.from_numpy(np.mean(reference_embeddings, axis=0)).unsqueeze(0),
|
114 |
+
dim=1,
|
115 |
+
)
|
116 |
+
|
117 |
+
# print(f"Mean cosine similarity: {cos_sim.item()}")
|
118 |
+
|
119 |
+
return cos_sim.item()
|
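A hedged usage sketch: extract_speaker_similarity expects two directories of mono wav files plus a RawNet3 checkpoint at the hard-coded path pretrained/rawnet3/model.pt; the directory names below are hypothetical.

from evaluation.metrics.similarity.speaker_similarity import extract_speaker_similarity

score = extract_speaker_similarity(
    target_path="result/target_singer",      # synthesized utterances (assumed layout)
    reference_path="data/reference_singer",  # reference utterances (assumed layout)
)
print(f"Speaker cosine similarity: {score:.4f}")  # closer to 1.0 means more similar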
evaluation/metrics/spectrogram/__init__.py
ADDED
File without changes
evaluation/metrics/spectrogram/frechet_distance.py
ADDED
@@ -0,0 +1,31 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from frechet_audio_distance import FrechetAudioDistance


def extract_fad(
    audio_dir1,
    audio_dir2,
    mode="vggish",
    use_pca=False,
    use_activation=False,
    verbose=False,
):
    """Extract Frechet Audio Distance for two given audio folders.
    audio_dir1: path to the ground truth audio folder.
    audio_dir2: path to the predicted audio folder.
    mode: "vggish", "pann", "clap" for different models.
    """
    frechet = FrechetAudioDistance(
        model_name=mode,
        use_pca=use_pca,
        use_activation=use_activation,
        verbose=verbose,
    )

    fad_score = frechet.score(audio_dir1, audio_dir2)

    return fad_score
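A hedged usage sketch of extract_fad; the folder names are hypothetical, and the frechet_audio_distance package fetches the chosen embedding model's weights on first use:

from evaluation.metrics.spectrogram.frechet_distance import extract_fad

fad = extract_fad("data/ground_truth_wavs", "result/predicted_wavs", mode="vggish")
print(f"FAD: {fad:.3f}")  # lower is better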
evaluation/metrics/spectrogram/mel_cepstral_distortion.py
ADDED
@@ -0,0 +1,21 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from pymcd.mcd import Calculate_MCD


def extract_mcd(audio_ref, audio_deg, fs=None, mode="dtw_sl"):
    """Extract Mel-Cepstral Distortion between two given audios.
    Args:
        audio_ref: The given reference audio. It is an audio path.
        audio_deg: The given synthesized audio. It is an audio path.
        mode: "plain", "dtw" or "dtw_sl".
    """
    mcd_toolbox = Calculate_MCD(MCD_mode=mode)
    if fs is not None:
        mcd_toolbox.SAMPLING_RATE = fs
    mcd_value = mcd_toolbox.calculate_mcd(audio_ref, audio_deg)

    return mcd_value
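A hedged usage sketch of extract_mcd (paths hypothetical); the default "dtw_sl" mode aligns the two utterances with DTW before computing the distortion:

from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd

mcd = extract_mcd("data/ref_001.wav", "result/pred_001.wav", mode="dtw_sl")
print(f"MCD: {mcd:.3f} dB")  # lower is better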
evaluation/metrics/spectrogram/multi_resolution_stft_distance.py
ADDED
@@ -0,0 +1,225 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import librosa
import torch

import numpy as np


def extract_mstft(
    audio_ref,
    audio_deg,
    fs=None,
    mid_freq=None,
    high_freq=None,
    method="cut",
    version="pwg",
):
    """Compute Multi-Scale STFT Distance (mstft) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    mid_freq: division frequency for the mid frequency parts.
    high_freq: division frequency for the high frequency parts.
    method: "dtw" will use the dtw algorithm to align the lengths of the ground truth and predicted audio.
        "cut" will cut both audios to the same length, according to the shorter one.
    version: "pwg" will use the computational version provided by ParallelWaveGAN.
        "encodec" will use the computational version provided by Encodec.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Automatically choose mid_freq and high_freq if they are not given
    if mid_freq is None:
        mid_freq = fs // 6
    if high_freq is None:
        high_freq = fs // 3

    # Audio length alignment
    if len(audio_ref) != len(audio_deg):
        if method == "cut":
            length = min(len(audio_ref), len(audio_deg))
            audio_ref = audio_ref[:length]
            audio_deg = audio_deg[:length]
        elif method == "dtw":
            _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True)
            audio_ref_new = []
            audio_deg_new = []
            for i in range(wp.shape[0]):
                ref_index = wp[i][0]
                deg_index = wp[i][1]
                audio_ref_new.append(audio_ref[ref_index])
                audio_deg_new.append(audio_deg[deg_index])
            audio_ref = np.array(audio_ref_new)
            audio_deg = np.array(audio_deg_new)
            assert len(audio_ref) == len(audio_deg)

    # Define loss functions
    l1Loss = torch.nn.L1Loss(reduction="mean")
    l2Loss = torch.nn.MSELoss(reduction="mean")

    # Compute distance
    if version == "encodec":
        n_fft = 1024

        mstft = 0
        mstft_low = 0
        mstft_mid = 0
        mstft_high = 0

        freq_resolution = fs / n_fft
        mid_freq_index = 1 + int(np.floor(mid_freq / freq_resolution))
        high_freq_index = 1 + int(np.floor(high_freq / freq_resolution))

        for i in range(5, 11):
            hop_length = 2**i // 4
            win_length = 2**i

            spec_ref = librosa.stft(
                y=audio_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length
            )
            spec_deg = librosa.stft(
                y=audio_deg, n_fft=n_fft, hop_length=hop_length, win_length=win_length
            )

            mag_ref = np.abs(spec_ref)
            mag_deg = np.abs(spec_deg)

            mag_ref = torch.from_numpy(mag_ref)
            mag_deg = torch.from_numpy(mag_deg)
            mstft += l1Loss(mag_ref, mag_deg) + l2Loss(mag_ref, mag_deg)

            mag_ref_low = mag_ref[:mid_freq_index, :]
            mag_deg_low = mag_deg[:mid_freq_index, :]
            mstft_low += l1Loss(mag_ref_low, mag_deg_low) + l2Loss(
                mag_ref_low, mag_deg_low
            )

            mag_ref_mid = mag_ref[mid_freq_index:high_freq_index, :]
            mag_deg_mid = mag_deg[mid_freq_index:high_freq_index, :]
            mstft_mid += l1Loss(mag_ref_mid, mag_deg_mid) + l2Loss(
                mag_ref_mid, mag_deg_mid
            )

            mag_ref_high = mag_ref[high_freq_index:, :]
            mag_deg_high = mag_deg[high_freq_index:, :]
            mstft_high += l1Loss(mag_ref_high, mag_deg_high) + l2Loss(
                mag_ref_high, mag_deg_high
            )

        mstft /= 6
        mstft_low /= 6
        mstft_mid /= 6
        mstft_high /= 6

        return mstft
    elif version == "pwg":
        fft_sizes = [1024, 2048, 512]
        hop_sizes = [120, 240, 50]
        win_sizes = [600, 1200, 240]

        audio_ref = torch.from_numpy(audio_ref)
        audio_deg = torch.from_numpy(audio_deg)

        mstft_sc = 0
        mstft_sc_low = 0
        mstft_sc_mid = 0
        mstft_sc_high = 0

        mstft_mag = 0
        mstft_mag_low = 0
        mstft_mag_mid = 0
        mstft_mag_high = 0

        for n_fft, hop_length, win_length in zip(fft_sizes, hop_sizes, win_sizes):
            spec_ref = torch.stft(
                audio_ref, n_fft, hop_length, win_length, return_complex=False
            )
            spec_deg = torch.stft(
                audio_deg, n_fft, hop_length, win_length, return_complex=False
            )

            real_ref = spec_ref[..., 0]
            imag_ref = spec_ref[..., 1]
            real_deg = spec_deg[..., 0]
            imag_deg = spec_deg[..., 1]

            mag_ref = torch.sqrt(
                torch.clamp(real_ref**2 + imag_ref**2, min=1e-7)
            ).transpose(1, 0)
            mag_deg = torch.sqrt(
                torch.clamp(real_deg**2 + imag_deg**2, min=1e-7)
            ).transpose(1, 0)
            sc_loss = torch.norm(mag_ref - mag_deg, p="fro") / torch.norm(
                mag_ref, p="fro"
            )
            mag_loss = l1Loss(torch.log(mag_ref), torch.log(mag_deg))

            mstft_sc += sc_loss
            mstft_mag += mag_loss

            freq_resolution = fs / n_fft
            mid_freq_index = 1 + int(np.floor(mid_freq / freq_resolution))
            high_freq_index = 1 + int(np.floor(high_freq / freq_resolution))

            mag_ref_low = mag_ref[:, :mid_freq_index]
            mag_deg_low = mag_deg[:, :mid_freq_index]
            sc_loss_low = torch.norm(mag_ref_low - mag_deg_low, p="fro") / torch.norm(
                mag_ref_low, p="fro"
            )
            mag_loss_low = l1Loss(torch.log(mag_ref_low), torch.log(mag_deg_low))

            mstft_sc_low += sc_loss_low
            mstft_mag_low += mag_loss_low

            mag_ref_mid = mag_ref[:, mid_freq_index:high_freq_index]
            mag_deg_mid = mag_deg[:, mid_freq_index:high_freq_index]
            sc_loss_mid = torch.norm(mag_ref_mid - mag_deg_mid, p="fro") / torch.norm(
                mag_ref_mid, p="fro"
            )
            mag_loss_mid = l1Loss(torch.log(mag_ref_mid), torch.log(mag_deg_mid))

            mstft_sc_mid += sc_loss_mid
            mstft_mag_mid += mag_loss_mid

            mag_ref_high = mag_ref[:, high_freq_index:]
            mag_deg_high = mag_deg[:, high_freq_index:]
            sc_loss_high = torch.norm(
                mag_ref_high - mag_deg_high, p="fro"
            ) / torch.norm(mag_ref_high, p="fro")
            mag_loss_high = l1Loss(torch.log(mag_ref_high), torch.log(mag_deg_high))

            mstft_sc_high += sc_loss_high
            mstft_mag_high += mag_loss_high

        # Normalize distances
        mstft_sc /= len(fft_sizes)
        mstft_sc_low /= len(fft_sizes)
        mstft_sc_mid /= len(fft_sizes)
        mstft_sc_high /= len(fft_sizes)

        mstft_mag /= len(fft_sizes)
        mstft_mag_low /= len(fft_sizes)
        mstft_mag_mid /= len(fft_sizes)
        mstft_mag_high /= len(fft_sizes)

        # return (
        #     mstft_sc.numpy().tolist(),
        #     mstft_sc_low.numpy().tolist(),
        #     mstft_sc_mid.numpy().tolist(),
        #     mstft_sc_high.numpy().tolist(),
        #     mstft_mag.numpy().tolist(),
        #     mstft_mag_low.numpy().tolist(),
        #     mstft_mag_mid.numpy().tolist(),
        #     mstft_mag_high.numpy().tolist(),
        # )

        return mstft_sc.numpy().tolist() + mstft_mag.numpy().tolist()
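With version="pwg", the function averages the spectral-convergence and log-magnitude terms over the three resolutions (the band-limited low/mid/high variants are computed but not returned) and returns their sum as a single float. A hedged usage sketch with hypothetical paths:

from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft

mstft = extract_mstft("data/ref_001.wav", "result/pred_001.wav", fs=24000, version="pwg")
print(f"Multi-resolution STFT distance: {mstft:.4f}")  # lower is better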
evaluation/metrics/spectrogram/pesq.py
ADDED
@@ -0,0 +1,56 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import librosa

import numpy as np

from pypesq import pesq


def extract_pesq(audio_ref, audio_deg, fs=None, method="cut"):
    """Extract PESQ for two given audios.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    method: "dtw" will use the dtw algorithm to align the lengths of the ground truth and predicted audio.
        "cut" will cut both audios to the same length, according to the shorter one.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Resample to 16 kHz
    if fs != 16000:
        audio_ref = librosa.resample(audio_ref, orig_sr=fs, target_sr=16000)
        audio_deg = librosa.resample(audio_deg, orig_sr=fs, target_sr=16000)
        fs = 16000

    # Audio length alignment
    if len(audio_ref) != len(audio_deg):
        if method == "cut":
            length = min(len(audio_ref), len(audio_deg))
            audio_ref = audio_ref[:length]
            audio_deg = audio_deg[:length]
        elif method == "dtw":
            _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True)
            audio_ref_new = []
            audio_deg_new = []
            for i in range(wp.shape[0]):
                ref_index = wp[i][0]
                deg_index = wp[i][1]
                audio_ref_new.append(audio_ref[ref_index])
                audio_deg_new.append(audio_deg[deg_index])
            audio_ref = np.array(audio_ref_new)
            audio_deg = np.array(audio_deg_new)
            assert len(audio_ref) == len(audio_deg)

    # Compute PESQ
    score = pesq(audio_ref, audio_deg, fs)
    return score
evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import librosa

import numpy as np

from torchmetrics import ScaleInvariantSignalDistortionRatio


def extract_si_sdr(audio_ref, audio_deg, fs=None, method="cut"):
    """Compute Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    method: "dtw" will use the dtw algorithm to align the lengths of the ground truth and predicted audio.
        "cut" will cut both audios to the same length, according to the shorter one.
    """
    si_sdr = ScaleInvariantSignalDistortionRatio()

    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    if len(audio_ref) != len(audio_deg):
        if method == "cut":
            length = min(len(audio_ref), len(audio_deg))
            audio_ref = audio_ref[:length]
            audio_deg = audio_deg[:length]
        elif method == "dtw":
            _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True)
            audio_ref_new = []
            audio_deg_new = []
            for i in range(wp.shape[0]):
                ref_index = wp[i][0]
                deg_index = wp[i][1]
                audio_ref_new.append(audio_ref[ref_index])
                audio_deg_new.append(audio_deg[deg_index])
            audio_ref = np.array(audio_ref_new)
            audio_deg = np.array(audio_deg_new)
            assert len(audio_ref) == len(audio_deg)

    audio_ref = torch.from_numpy(audio_ref)
    audio_deg = torch.from_numpy(audio_deg)

    return si_sdr(audio_deg, audio_ref)
evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import librosa

import numpy as np

from torchmetrics import ScaleInvariantSignalNoiseRatio


def extract_si_snr(audio_ref, audio_deg, fs=None, method="cut"):
    """Compute Scale-Invariant Signal-to-Noise Ratio (SI-SNR) between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    method: "dtw" will use the dtw algorithm to align the lengths of the ground truth and predicted audio.
        "cut" will cut both audios to the same length, according to the shorter one.
    """
    si_snr = ScaleInvariantSignalNoiseRatio()

    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    if len(audio_ref) != len(audio_deg):
        if method == "cut":
            length = min(len(audio_ref), len(audio_deg))
            audio_ref = audio_ref[:length]
            audio_deg = audio_deg[:length]
        elif method == "dtw":
            _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True)
            audio_ref_new = []
            audio_deg_new = []
            for i in range(wp.shape[0]):
                ref_index = wp[i][0]
                deg_index = wp[i][1]
                audio_ref_new.append(audio_ref[ref_index])
                audio_deg_new.append(audio_deg[deg_index])
            audio_ref = np.array(audio_ref_new)
            audio_deg = np.array(audio_deg_new)
            assert len(audio_ref) == len(audio_deg)

    audio_ref = torch.from_numpy(audio_ref)
    audio_deg = torch.from_numpy(audio_deg)

    return si_snr(audio_deg, audio_ref)
evaluation/metrics/spectrogram/short_time_objective_intelligibility.py
ADDED
@@ -0,0 +1,56 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import librosa

import numpy as np

from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility


def extract_stoi(audio_ref, audio_deg, fs=None, extended=False, method="cut"):
    """Compute Short-Time Objective Intelligibility between the predicted and the ground truth audio.
    audio_ref: path to the ground truth audio.
    audio_deg: path to the predicted audio.
    fs: sampling rate.
    extended: whether to compute the extended STOI (ESTOI).
    method: "dtw" will use the dtw algorithm to align the lengths of the ground truth and predicted audio.
        "cut" will cut both audios to the same length, according to the shorter one.
    """
    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Initialize method
    stoi = ShortTimeObjectiveIntelligibility(fs, extended)

    # Audio length alignment
    if len(audio_ref) != len(audio_deg):
        if method == "cut":
            length = min(len(audio_ref), len(audio_deg))
            audio_ref = audio_ref[:length]
            audio_deg = audio_deg[:length]
        elif method == "dtw":
            _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True)
            audio_ref_new = []
            audio_deg_new = []
            for i in range(wp.shape[0]):
                ref_index = wp[i][0]
                deg_index = wp[i][1]
                audio_ref_new.append(audio_ref[ref_index])
                audio_deg_new.append(audio_deg[deg_index])
            audio_ref = np.array(audio_ref_new)
            audio_deg = np.array(audio_deg_new)
            assert len(audio_ref) == len(audio_deg)

    # Convert to tensor
    audio_ref = torch.from_numpy(audio_ref)
    audio_deg = torch.from_numpy(audio_deg)

    return stoi(audio_deg, audio_ref).numpy().tolist()
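The four waveform-level metrics above (extract_pesq, extract_si_sdr, extract_si_snr, extract_stoi) share the same (audio_ref, audio_deg, fs, method) signature, so one loop can drive them all; a hedged sketch with hypothetical file paths:

from evaluation.metrics.spectrogram.pesq import extract_pesq
from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import extract_si_sdr
from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import extract_si_snr
from evaluation.metrics.spectrogram.short_time_objective_intelligibility import extract_stoi

ref, deg = "data/ref_001.wav", "result/pred_001.wav"  # hypothetical paths
for name, metric in [
    ("PESQ", extract_pesq),
    ("SI-SDR", extract_si_sdr),
    ("SI-SNR", extract_si_snr),
    ("STOI", extract_stoi),
]:
    print(name, metric(ref, deg, fs=16000, method="cut"))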
models/tts/base/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# from .tts_inferece import TTSInference
from .tts_trainer import TTSTrainer
models/tts/base/tts_dataset.py
ADDED
@@ -0,0 +1,389 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
import torchaudio
import numpy as np
import torch
from utils.data_utils import *
from torch.nn.utils.rnn import pad_sequence
from text import text_to_sequence
from text.text_token_collation import phoneIDCollation
from processors.acoustic_extractor import cal_normalized_mel

from models.base.base_dataset import (
    BaseDataset,
    BaseCollator,
    BaseTestDataset,
    BaseTestCollator,
)

from processors.content_extractor import (
    ContentvecExtractor,
    WenetExtractor,
    WhisperExtractor,
)


class TTSDataset(BaseDataset):
    def __init__(self, cfg, dataset, is_valid=False):
        """
        Args:
            cfg: config
            dataset: dataset name
            is_valid: whether to use the train or valid dataset
        """

        assert isinstance(dataset, str)

        self.cfg = cfg

        processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
        meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
        self.metafile_path = os.path.join(processed_data_dir, meta_file)
        self.metadata = self.get_metadata()

        """
        load spk2id and utt2spk from json file
            spk2id: {spk1: 0, spk2: 1, ...}
            utt2spk: {dataset_uid: spk1, ...}
        """
        if cfg.preprocess.use_spkid:
            dataset = self.metadata[0]["Dataset"]

            spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id)
            with open(spk2id_path, "r") as f:
                self.spk2id = json.load(f)

            utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk)
            self.utt2spk = dict()
            with open(utt2spk_path, "r") as f:
                for line in f.readlines():
                    utt, spk = line.strip().split("\t")
                    self.utt2spk[utt] = spk

        if cfg.preprocess.use_uv:
            self.utt2uv_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)
                self.utt2uv_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.uv_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_frame_pitch:
            self.utt2frame_pitch_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2frame_pitch_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.pitch_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_frame_energy:
            self.utt2frame_energy_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2frame_energy_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.energy_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_mel:
            self.utt2mel_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2mel_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.mel_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_linear:
            self.utt2linear_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2linear_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.linear_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_audio:
            self.utt2audio_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                if cfg.preprocess.extract_audio:
                    self.utt2audio_path[utt] = os.path.join(
                        cfg.preprocess.processed_dir,
                        dataset,
                        cfg.preprocess.audio_dir,
                        uid + ".wav",
                    )
                else:
                    self.utt2audio_path[utt] = utt_info["Path"]

                # self.utt2audio_path[utt] = os.path.join(
                #     cfg.preprocess.processed_dir,
                #     dataset,
                #     cfg.preprocess.audio_dir,
                #     uid + ".numpy",
                # )

        elif cfg.preprocess.use_label:
            self.utt2label_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2label_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.label_dir,
                    uid + ".npy",
                )
        elif cfg.preprocess.use_one_hot:
            self.utt2one_hot_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2one_hot_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.one_hot_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_text or cfg.preprocess.use_phone:
            self.utt2seq = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                if cfg.preprocess.use_text:
                    text = utt_info["Text"]
                    sequence = text_to_sequence(text, cfg.preprocess.text_cleaners)
                elif cfg.preprocess.use_phone:
                    # load phoneme sequence from phone file
                    phone_path = os.path.join(
                        processed_data_dir, cfg.preprocess.phone_dir, uid + ".phone"
                    )
                    with open(phone_path, "r") as fin:
                        phones = fin.readlines()
                        assert len(phones) == 1
                        phones = phones[0].strip()
                    phones_seq = phones.split(" ")

                    phon_id_collator = phoneIDCollation(cfg, dataset=dataset)
                    sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq)

                self.utt2seq[utt] = sequence

    def __getitem__(self, index):
        utt_info = self.metadata[index]

        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        single_feature = dict()

        if self.cfg.preprocess.use_spkid:
            single_feature["spk_id"] = np.array(
                [self.spk2id[self.utt2spk[utt]]], dtype=np.int32
            )

        if self.cfg.preprocess.use_mel:
            mel = np.load(self.utt2mel_path[utt])
            assert mel.shape[0] == self.cfg.preprocess.n_mel  # [n_mels, T]
            if self.cfg.preprocess.use_min_max_norm_mel:
                # do mel norm
                mel = cal_normalized_mel(mel, utt_info["Dataset"], self.cfg.preprocess)

            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = mel.shape[1]
            single_feature["mel"] = mel.T  # [T, n_mels]

        if self.cfg.preprocess.use_linear:
            linear = np.load(self.utt2linear_path[utt])
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = linear.shape[1]
            single_feature["linear"] = linear.T  # [T, n_linear]

        if self.cfg.preprocess.use_frame_pitch:
            frame_pitch_path = self.utt2frame_pitch_path[utt]
            frame_pitch = np.load(frame_pitch_path)
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_pitch)
            aligned_frame_pitch = align_length(
                frame_pitch, single_feature["target_len"]
            )
            single_feature["frame_pitch"] = aligned_frame_pitch

        if self.cfg.preprocess.use_uv:
            frame_uv_path = self.utt2uv_path[utt]
            frame_uv = np.load(frame_uv_path)
            aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
            aligned_frame_uv = [
                0 if frame_uv else 1 for frame_uv in aligned_frame_uv
            ]
            aligned_frame_uv = np.array(aligned_frame_uv)
            single_feature["frame_uv"] = aligned_frame_uv

        if self.cfg.preprocess.use_frame_energy:
            frame_energy_path = self.utt2frame_energy_path[utt]
            frame_energy = np.load(frame_energy_path)
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_energy)
            aligned_frame_energy = align_length(
                frame_energy, single_feature["target_len"]
            )
            single_feature["frame_energy"] = aligned_frame_energy

        if self.cfg.preprocess.use_audio:
            audio, sr = torchaudio.load(self.utt2audio_path[utt])
            audio = audio.cpu().numpy().squeeze()
            single_feature["audio"] = audio
            single_feature["audio_len"] = audio.shape[0]

        if self.cfg.preprocess.use_phone or self.cfg.preprocess.use_text:
            single_feature["phone_seq"] = np.array(self.utt2seq[utt])
            single_feature["phone_len"] = len(self.utt2seq[utt])

        return single_feature

    def __len__(self):
        return super().__len__()

    def get_metadata(self):
        return super().get_metadata()


class TTSCollator(BaseCollator):
    """Zero-pads model inputs and targets based on the number of frames per step"""

    def __init__(self, cfg):
        super().__init__(cfg)

    def __call__(self, batch):
        parsed_batch_features = super().__call__(batch)
        return parsed_batch_features


class TTSTestDataset(BaseTestDataset):
    def __init__(self, args, cfg):
        self.cfg = cfg

        # inference from a test list file
        if args.test_list_file is not None:
            # construct metadata
            self.metadata = []

            with open(args.test_list_file, "r") as fin:
                for idx, line in enumerate(fin.readlines()):
                    utt_info = {}

                    utt_info["Dataset"] = "test"
                    utt_info["Text"] = line.strip()
                    utt_info["Uid"] = str(idx)
                    self.metadata.append(utt_info)

        else:
            assert args.testing_set
            self.metafile_path = os.path.join(
                cfg.preprocess.processed_dir,
                args.dataset,
                "{}.json".format(args.testing_set),
            )
            self.metadata = self.get_metadata()

    def __getitem__(self, index):
        single_feature = {}

        return single_feature

    def __len__(self):
        return len(self.metadata)


class TTSTestCollator(BaseTestCollator):
    """Zero-pads model inputs and targets based on the number of frames per step"""

    def __init__(self, cfg):
        self.cfg = cfg

    def __call__(self, batch):
        packed_batch_features = dict()

        # mel: [b, T, n_mels]
        # frame_pitch, frame_energy: [1, T]
        # target_len: [1]
        # spk_id: [b, 1]
        # mask: [b, T, 1]

        for key in batch[0].keys():
            if key == "target_len":
                packed_batch_features["target_len"] = torch.LongTensor(
                    [b["target_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
                ]
                packed_batch_features["mask"] = pad_sequence(
                    masks, batch_first=True, padding_value=0
                )
            elif key == "phone_len":
                packed_batch_features["phone_len"] = torch.LongTensor(
                    [b["phone_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["phone_len"], 1), dtype=torch.long) for b in batch
                ]
                packed_batch_features["phn_mask"] = pad_sequence(
                    masks, batch_first=True, padding_value=0
                )
            elif key == "audio_len":
                packed_batch_features["audio_len"] = torch.LongTensor(
                    [b["audio_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["audio_len"], 1), dtype=torch.long) for b in batch
                ]
            else:
                values = [torch.from_numpy(b[key]) for b in batch]
                packed_batch_features[key] = pad_sequence(
                    values, batch_first=True, padding_value=0
                )
        return packed_batch_features
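A hedged sketch of how these pieces are expected to compose with a PyTorch DataLoader; the config object and dataset name below are assumptions (in Amphion they come from the experiment's json config):

from torch.utils.data import DataLoader
from models.tts.base.tts_dataset import TTSDataset, TTSCollator

dataset = TTSDataset(cfg, "LJSpeech", is_valid=False)  # cfg: loaded experiment config (assumed)
loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=TTSCollator(cfg),  # pads variable-length features per batch
)
batch = next(iter(loader))  # dict of padded tensors, e.g. batch["mel"]: [b, T, n_mels]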
models/tts/base/tts_inferece.py
ADDED
@@ -0,0 +1,268 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import torch
import time
import accelerate
import random
import numpy as np
from tqdm import tqdm
from accelerate.logging import get_logger
from torch.utils.data import DataLoader


from abc import abstractmethod
from pathlib import Path
from utils.io import save_audio
from utils.util import load_config
from models.vocoders.vocoder_inference import synthesis


class TTSInference(object):
    def __init__(self, args=None, cfg=None):
        super().__init__()

        start = time.monotonic_ns()
        self.args = args
        self.cfg = cfg
        self.infer_type = args.mode

        # get exp_dir
        if self.args.acoustics_dir is not None:
            self.exp_dir = self.args.acoustics_dir
        elif self.args.checkpoint_path is not None:
            self.exp_dir = os.path.dirname(os.path.dirname(self.args.checkpoint_path))

        # Init accelerator
        self.accelerator = accelerate.Accelerator()
        self.accelerator.wait_for_everyone()
        self.device = self.accelerator.device

        # Get logger
        with self.accelerator.main_process_first():
            self.logger = get_logger("inference", log_level=args.log_level)

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")

        self.acoustic_model_dir = args.acoustics_dir
        self.logger.debug(f"Acoustic model dir: {args.acoustics_dir}")

        if args.vocoder_dir is not None:
            self.vocoder_dir = args.vocoder_dir
            self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")

        os.makedirs(args.output_dir, exist_ok=True)

        # Set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # Setup data loader
        if self.infer_type == "batch":
            with self.accelerator.main_process_first():
                self.logger.info("Building dataset...")
                start = time.monotonic_ns()
                self.test_dataloader = self._build_test_dataloader()
                end = time.monotonic_ns()
                self.logger.info(
                    f"Building dataset done in {(end - start) / 1e6:.2f}ms"
                )

        # Build model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            self.logger.info(f"Building model done in {(end - start) / 1e6:.3f}ms")

        # Init with accelerate
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        self.accelerator = accelerate.Accelerator()
        self.model = self.accelerator.prepare(self.model)
        if self.infer_type == "batch":
            self.test_dataloader = self.accelerator.prepare(self.test_dataloader)
        end = time.monotonic_ns()
        self.accelerator.wait_for_everyone()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.3f}ms")

        with self.accelerator.main_process_first():
            self.logger.info("Loading checkpoint...")
            start = time.monotonic_ns()
            if args.acoustics_dir is not None:
                self._load_model(
                    checkpoint_dir=os.path.join(args.acoustics_dir, "checkpoint")
                )
            elif args.checkpoint_path is not None:
                self._load_model(checkpoint_path=args.checkpoint_path)
            else:
                print("Either checkpoint dir or checkpoint path should be provided.")

            end = time.monotonic_ns()
            self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.3f}ms")

        self.model.eval()
        self.accelerator.wait_for_everyone()

    def _build_test_dataset(self):
        pass

    def _build_model(self):
        pass

    # TODO: LEGACY CODE
    def _build_test_dataloader(self):
        datasets, collate = self._build_test_dataset()
        self.test_dataset = datasets(self.args, self.cfg)
        self.test_collate = collate(self.cfg)
        self.test_batch_size = min(
            self.cfg.train.batch_size, len(self.test_dataset.metadata)
        )
        test_dataloader = DataLoader(
            self.test_dataset,
            collate_fn=self.test_collate,
            num_workers=1,
            batch_size=self.test_batch_size,
            shuffle=False,
        )
        return test_dataloader

    def _load_model(
        self,
        checkpoint_dir: str = None,
        checkpoint_path: str = None,
        old_mode: bool = False,
    ):
        r"""Load model from checkpoint. If checkpoint_path is None, it will
        load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
        None, it will load the checkpoint specified by checkpoint_path. **Only use this
        method after** ``accelerator.prepare()``.
        """

        if checkpoint_path is None:
            assert checkpoint_dir is not None
            # Load the latest accelerator state dicts
            ls = [
                str(i) for i in Path(checkpoint_dir).glob("*") if "audio" not in str(i)
            ]
            ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
            checkpoint_path = ls[0]

        self.accelerator.load_state(str(checkpoint_path))
        return str(checkpoint_path)

    def inference(self):
        if self.infer_type == "single":
            out_dir = os.path.join(self.args.output_dir, "single")
            os.makedirs(out_dir, exist_ok=True)

            pred_audio = self.inference_for_single_utterance()
            save_path = os.path.join(out_dir, "test_pred.wav")
            save_audio(save_path, pred_audio, self.cfg.preprocess.sample_rate)

        elif self.infer_type == "batch":
            out_dir = os.path.join(self.args.output_dir, "batch")
            os.makedirs(out_dir, exist_ok=True)

            pred_audio_list = self.inference_for_batches()
            for it, wav in zip(self.test_dataset.metadata, pred_audio_list):
                uid = it["Uid"]
                save_audio(
                    os.path.join(out_dir, f"{uid}.wav"),
                    wav.numpy(),
                    self.cfg.preprocess.sample_rate,
                    add_silence=True,
                    turn_up=True,
                )
                tmp_file = os.path.join(out_dir, f"{uid}.pt")
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
            print("Saved to: ", out_dir)

    @torch.inference_mode()
    def inference_for_batches(self):
        y_pred = []
        for i, batch in tqdm(enumerate(self.test_dataloader)):
            y_pred, mel_lens, _ = self._inference_each_batch(batch)
            y_ls = y_pred.chunk(self.test_batch_size)
            tgt_ls = mel_lens.chunk(self.test_batch_size)
            j = 0
            for it, l in zip(y_ls, tgt_ls):
                l = l.item()
                it = it.squeeze(0)[:l].detach().cpu()

                uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
                torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
                j += 1

        vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir)
        res = synthesis(
            cfg=vocoder_cfg,
            vocoder_weight_file=vocoder_ckpt,
            n_samples=None,
            pred=[
                torch.load(
                    os.path.join(self.args.output_dir, "{}.pt".format(item["Uid"]))
                ).numpy()
                for item in self.test_dataset.metadata
            ],
        )
        for it, wav in zip(self.test_dataset.metadata, res):
            uid = it["Uid"]
            save_audio(
                os.path.join(self.args.output_dir, f"{uid}.wav"),
                wav.numpy(),
                22050,
                add_silence=True,
                turn_up=True,
            )

    @abstractmethod
    @torch.inference_mode()
    def _inference_each_batch(self, batch_data):
        pass

    def inference_for_single_utterance(self, text):
        pass

    def synthesis_by_vocoder(self, pred):
        audios_pred = synthesis(
            self.vocoder_cfg,
            self.checkpoint_dir_vocoder,
            len(pred),
            pred,
        )

        return audios_pred

    @staticmethod
    def _parse_vocoder(vocoder_dir):
        r"""Parse vocoder config"""
        vocoder_dir = os.path.abspath(vocoder_dir)
        ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
        ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
        ckpt_path = str(ckpt_list[0])
        vocoder_cfg = load_config(
            os.path.join(vocoder_dir, "args.json"), lowercase=True
        )
        return vocoder_cfg, ckpt_path

    def _set_random_seed(self, seed):
        """Set random seed for all possible random modules."""
        random.seed(seed)
        np.random.seed(seed)
        torch.random.manual_seed(seed)
models/tts/base/tts_trainer.py
ADDED
@@ -0,0 +1,699 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
import torch
|
10 |
+
import time
|
11 |
+
from pathlib import Path
|
12 |
+
import torch
|
13 |
+
from tqdm import tqdm
|
14 |
+
import re
|
15 |
+
import logging
|
16 |
+
import json5
|
17 |
+
import accelerate
|
18 |
+
from accelerate.logging import get_logger
|
19 |
+
from accelerate.utils import ProjectConfiguration
|
20 |
+
from torch.utils.data import ConcatDataset, DataLoader
|
21 |
+
from accelerate import DistributedDataParallelKwargs
|
22 |
+
from schedulers.scheduler import Eden
|
23 |
+
from models.base.base_sampler import build_samplers
|
24 |
+
from models.base.new_trainer import BaseTrainer
|
25 |
+
|
26 |
+
|
27 |
+
class TTSTrainer(BaseTrainer):
|
28 |
+
r"""The base trainer for all TTS models. It inherits from BaseTrainer and implements
|
29 |
+
``build_criterion``, ``_build_dataset`` and ``_build_singer_lut`` methods. You can inherit from this
|
30 |
+
class, and implement ``_build_model``, ``_forward_step``.
|
31 |
+
"""
|

    def __init__(self, args=None, cfg=None):
        self.args = args
        self.cfg = cfg

        cfg.exp_name = args.exp_name

        # Initialize accelerate and ensure all processes are ready
        self._init_accelerator()
        self.accelerator.wait_for_everyone()

        with self.accelerator.main_process_first():
            self.logger = get_logger(args.exp_name, log_level="INFO")

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New training process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")
        self.logger.debug(f"Using {args.log_level.upper()} logging level.")
        self.logger.info(f"Experiment name: {args.exp_name}")
        self.logger.info(f"Experiment directory: {self.exp_dir}")
        self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
        if self.accelerator.is_main_process:
            os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")

        # Initialize counters
        self.batch_count: int = 0
        self.step: int = 0
        self.epoch: int = 0
        self.max_epoch = (
            self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
        )
        self.logger.info(
            "Max epoch: {}".format(
                self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
            )
        )

        # Check config values
        if self.accelerator.is_main_process:
            self.__check_basic_configs()
        # Set runtime configs
        self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
        self.checkpoints_path = [
            [] for _ in range(len(self.save_checkpoint_stride))
        ]
        self.keep_last = [
            i if i > 0 else float("inf") for i in self.cfg.train.keep_last
        ]
        self.run_eval = self.cfg.train.run_eval

        # Set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # Set up dataloaders
        with self.accelerator.main_process_first():
            self.logger.info("Building dataset...")
            start = time.monotonic_ns()
            self.train_dataloader, self.valid_dataloader = self._build_dataloader()
            end = time.monotonic_ns()
            self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")

        # Save the phone table to the experiment dir. This must happen before
        # building the model, because the model loads the phone table when built.
        if cfg.preprocess.use_phone and cfg.preprocess.phone_extractor != "lexicon":
            self._save_phone_symbols_file_to_exp_path()

        # Set up model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            self.logger.debug(self.model)
            self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
            self.logger.info(
                f"Model parameters: {self.__count_parameters(self.model)/1e6:.2f}M"
            )

        # Optimizer & scheduler
        with self.accelerator.main_process_first():
            self.logger.info("Building optimizer and scheduler...")
            start = time.monotonic_ns()
            self.optimizer = self._build_optimizer()
            self.scheduler = self._build_scheduler()
            end = time.monotonic_ns()
            self.logger.info(
                f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
            )

        # Create criterion
        with self.accelerator.main_process_first():
            self.logger.info("Building criterion...")
            start = time.monotonic_ns()
            self.criterion = self._build_criterion()
            end = time.monotonic_ns()
            self.logger.info(f"Building criterion done in {(end - start) / 1e6:.2f}ms")

        # Resume or finetune
        with self.accelerator.main_process_first():
            self._check_resume()

        # Accelerate prepare
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        self._accelerator_prepare()
        end = time.monotonic_ns()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")

        # Save config file path
        self.config_save_path = os.path.join(self.exp_dir, "args.json")
        self.device = self.accelerator.device

        if cfg.preprocess.use_spkid and cfg.train.multi_speaker_training:
            self.speakers = self._build_speaker_lut()
            self.utt2spk_dict = self._build_utt2spk_dict()

        # Only for TTS tasks
        self.task_type = "TTS"
        self.logger.info("Task type: {}".format(self.task_type))

    def _check_resume(self):
        if self.args.resume or (
            self.cfg.model_type == "VALLE" and self.args.train_stage == 2
        ):
            # Stage-2 VALLE training loads weights from an earlier checkpoint
            # in finetune mode
            if self.cfg.model_type == "VALLE" and self.args.train_stage == 2:
                self.args.resume_type = "finetune"

            self.logger.info("Resuming from checkpoint...")
            start = time.monotonic_ns()
            self.ckpt_path = self._load_model(
                self.checkpoint_dir, self.args.checkpoint_path, self.args.resume_type
            )
            end = time.monotonic_ns()
            self.logger.info(
                f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
            )
            with open(os.path.join(self.ckpt_path, "ckpts.json"), "r") as f:
                self.checkpoints_path = json.load(f)

        self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
        if self.accelerator.is_main_process:
            os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")

    def _init_accelerator(self):
        self.exp_dir = os.path.join(
            os.path.abspath(self.cfg.log_dir), self.args.exp_name
        )
        project_config = ProjectConfiguration(
            project_dir=self.exp_dir,
            logging_dir=os.path.join(self.exp_dir, "log"),
        )
        kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
        self.accelerator = accelerate.Accelerator(
            gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
            log_with=self.cfg.train.tracker,
            project_config=project_config,
            kwargs_handlers=[kwargs],
        )
        if self.accelerator.is_main_process:
            os.makedirs(project_config.project_dir, exist_ok=True)
            os.makedirs(project_config.logging_dir, exist_ok=True)
        with self.accelerator.main_process_first():
            self.accelerator.init_trackers(self.args.exp_name)

    def _accelerator_prepare(self):
        (
            self.train_dataloader,
            self.valid_dataloader,
        ) = self.accelerator.prepare(
            self.train_dataloader,
            self.valid_dataloader,
        )

        if isinstance(self.model, dict):
            for key in self.model.keys():
                self.model[key] = self.accelerator.prepare(self.model[key])
        else:
            self.model = self.accelerator.prepare(self.model)

        if isinstance(self.optimizer, dict):
            for key in self.optimizer.keys():
                self.optimizer[key] = self.accelerator.prepare(self.optimizer[key])
        else:
            self.optimizer = self.accelerator.prepare(self.optimizer)

        if isinstance(self.scheduler, dict):
            for key in self.scheduler.keys():
                self.scheduler[key] = self.accelerator.prepare(self.scheduler[key])
        else:
            self.scheduler = self.accelerator.prepare(self.scheduler)

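    # NOTE: ``accelerator.prepare`` is applied piecewise above because the
    # model, optimizer and scheduler may each be either a single object or a
    # dict of named sub-modules (a convention checked throughout this class);
    # every entry is wrapped individually.
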
    ### Following are methods only for TTS tasks ###
    def _build_dataset(self):
        pass

    def _build_criterion(self):
        pass

    def _build_model(self):
        pass

    def _build_dataloader(self):
        """Build a dataloader that merges a series of datasets."""
        # Build a dataset instance per dataset and combine them with ConcatDataset
        Dataset, Collator = self._build_dataset()

        # Build train set
        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=False)
            datasets_list.append(subdataset)
        train_dataset = ConcatDataset(datasets_list)
        train_collate = Collator(self.cfg)
        _, batch_sampler = build_samplers(train_dataset, self.cfg, self.logger, "train")
        train_loader = DataLoader(
            train_dataset,
            collate_fn=train_collate,
            batch_sampler=batch_sampler,
            num_workers=self.cfg.train.dataloader.num_worker,
            pin_memory=self.cfg.train.dataloader.pin_memory,
        )

        # Build valid set
        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=True)
            datasets_list.append(subdataset)
        valid_dataset = ConcatDataset(datasets_list)
        valid_collate = Collator(self.cfg)
        _, batch_sampler = build_samplers(valid_dataset, self.cfg, self.logger, "valid")
        valid_loader = DataLoader(
            valid_dataset,
            collate_fn=valid_collate,
            batch_sampler=batch_sampler,
            num_workers=self.cfg.train.dataloader.num_worker,
            pin_memory=self.cfg.train.dataloader.pin_memory,
        )
        return train_loader, valid_loader

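    # A minimal sketch of the contract ``_build_dataset`` must satisfy for
    # ``_build_dataloader`` above: it returns the dataset *class* and the
    # collator *class* (not instances). The names below are hypothetical,
    # for illustration only:
    #
    #     def _build_dataset(self):
    #         return MyTTSDataset, MyTTSCollator
    #
    # where ``MyTTSDataset(cfg, dataset_name, is_valid=...)`` yields samples
    # and ``MyTTSCollator(cfg)`` is callable on a list of samples to form a
    # batch, matching the calls above.
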
    def _build_optimizer(self):
        pass

    def _build_scheduler(self):
        pass

    def _load_model(self, checkpoint_dir, checkpoint_path=None, resume_type="resume"):
        """Load a model from checkpoint. If a directory is given, load the
        latest checkpoint in ``checkpoint_dir``; if a path is given, load the
        checkpoint specified by ``checkpoint_path``.
        **Only use this method after** ``accelerator.prepare()``.
        """
        if checkpoint_path is None:
            ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
            ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
            checkpoint_path = ls[0]
        self.logger.info("Load model from {}".format(checkpoint_path))
        print("Load model from {}".format(checkpoint_path))
        if resume_type == "resume":
            self.accelerator.load_state(checkpoint_path)
            self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
            self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
        elif resume_type == "finetune":
            self.model.load_state_dict(
                torch.load(os.path.join(checkpoint_path, "pytorch_model.bin"))
            )
            self.model.cuda(self.accelerator.device)
            self.logger.info("Loaded model weights for finetuning.")
        else:
            raise ValueError("Unsupported resume type: {}".format(resume_type))

        return checkpoint_path

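    # NOTE: when ``checkpoint_path`` is None, the newest checkpoint is picked
    # by parsing the epoch number out of directory names of the form
    # ``epoch-{:04d}_step-{:07d}_loss-{:.6f}`` (see ``train_loop`` below), so
    # foreign files in the checkpoint directory would break the sort.
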
    ### THIS IS MAIN ENTRY ###
    def train_loop(self):
        r"""Training loop. The public entry of the training process."""
        # Wait for everyone to be ready before we move on
        self.accelerator.wait_for_everyone()
        # Dump the config file
        if self.accelerator.is_main_process:
            self.__dump_cfg(self.config_save_path)

        self.accelerator.wait_for_everyone()
        while self.epoch < self.max_epoch:
            self.logger.info("\n")
            self.logger.info("-" * 32)
            self.logger.info("Epoch {}: ".format(self.epoch))

            # Run one training epoch and one validation epoch
            train_total_loss, train_losses = self._train_epoch()
            if isinstance(train_losses, dict):
                for key, loss in train_losses.items():
                    self.logger.info(" |- Train/{} Loss: {:.6f}".format(key, loss))
                    self.accelerator.log(
                        {"Epoch/Train {} Loss".format(key): loss},
                        step=self.epoch,
                    )

            valid_total_loss, valid_losses = self._valid_epoch()
            if isinstance(valid_losses, dict):
                for key, loss in valid_losses.items():
                    self.logger.info(" |- Valid/{} Loss: {:.6f}".format(key, loss))
                    self.accelerator.log(
                        {"Epoch/Valid {} Loss".format(key): loss},
                        step=self.epoch,
                    )

            self.logger.info(" |- Train/Loss: {:.6f}".format(train_total_loss))
            self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_total_loss))
            self.accelerator.log(
                {
                    "Epoch/Train Loss": train_total_loss,
                    "Epoch/Valid Loss": valid_total_loss,
                },
                step=self.epoch,
            )

            self.accelerator.wait_for_everyone()

            # Check whether we hit a save_checkpoint_stride and need run_eval
            run_eval = False
            if self.accelerator.is_main_process:
                save_checkpoint = False
                hit_idx = []
                for i, num in enumerate(self.save_checkpoint_stride):
                    if self.epoch % num == 0:
                        save_checkpoint = True
                        hit_idx.append(i)
                        run_eval |= self.run_eval[i]

            self.accelerator.wait_for_everyone()
            if self.accelerator.is_main_process and save_checkpoint:
                path = os.path.join(
                    self.checkpoint_dir,
                    "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
                        self.epoch, self.step, train_total_loss
                    ),
                )
                self.accelerator.save_state(path)

                with open(os.path.join(path, "ckpts.json"), "w") as f:
                    json.dump(
                        self.checkpoints_path,
                        f,
                        ensure_ascii=False,
                        indent=4,
                    )

                # Collect checkpoints that exceed keep_last for their stride
                to_remove = []
                for idx in hit_idx:
                    self.checkpoints_path[idx].append(path)
                    while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
                        to_remove.append((idx, self.checkpoints_path[idx].pop(0)))

                # Search conflicts: keep checkpoints still referenced elsewhere
                total = set()
                for i in self.checkpoints_path:
                    total |= set(i)
                do_remove = set()
                for idx, path in to_remove[::-1]:
                    if path in total:
                        self.checkpoints_path[idx].insert(0, path)
                    else:
                        do_remove.add(path)

                # Remove old checkpoints from disk
                for path in do_remove:
                    shutil.rmtree(path, ignore_errors=True)
                    self.logger.debug(f"Removed old checkpoint: {path}")

            self.accelerator.wait_for_everyone()
            if run_eval:
                # TODO: run evaluation
                pass

            # Update the epoch counter
            self.epoch += 1

        # Finish training and save the final checkpoint
        self.accelerator.wait_for_everyone()
        if self.accelerator.is_main_process:
            path = os.path.join(
                self.checkpoint_dir,
                "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
                    self.epoch, self.step, valid_total_loss
                ),
            )
            self.accelerator.save_state(path)

            with open(os.path.join(path, "ckpts.json"), "w") as f:
                json.dump(
                    self.checkpoints_path,
                    f,
                    ensure_ascii=False,
                    indent=4,
                )

        self.accelerator.end_training()

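    # NOTE on checkpoint rotation in ``train_loop``: each entry of
    # ``save_checkpoint_stride`` keeps its own history list in
    # ``self.checkpoints_path``; a checkpoint directory is only deleted from
    # disk once no stride's history still references it.
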
    ### Following are methods that can be used directly in child classes ###
    def _train_epoch(self):
        r"""Training epoch. Returns the average loss of a batch (sample) over
        one epoch. See ``train_loop`` for usage.
        """
        if isinstance(self.model, dict):
            for key in self.model.keys():
                self.model[key].train()
        else:
            self.model.train()

        epoch_sum_loss: float = 0.0
        epoch_losses: dict = {}
        epoch_step: int = 0
        for batch in tqdm(
            self.train_dataloader,
            desc=f"Training Epoch {self.epoch}",
            unit="batch",
            colour="GREEN",
            leave=False,
            dynamic_ncols=True,
            smoothing=0.04,
            disable=not self.accelerator.is_main_process,
        ):
            # Run one training step with backpropagation
            with self.accelerator.accumulate(self.model):
                total_loss, train_losses, _ = self._train_step(batch)
            self.batch_count += 1

            # Update info for each step
            # TODO: does step mean backprop counts or batch counts?
            if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
                if isinstance(self.scheduler, dict):
                    for key in self.scheduler.keys():
                        self.scheduler[key].step()
                else:
                    if isinstance(self.scheduler, Eden):
                        self.scheduler.step_batch(self.step)
                    else:
                        self.scheduler.step()

                epoch_sum_loss += total_loss

                if isinstance(train_losses, dict):
                    for key, value in train_losses.items():
                        if key not in epoch_losses.keys():
                            epoch_losses[key] = value
                        else:
                            epoch_losses[key] += value

                if isinstance(train_losses, dict):
                    for key, loss in train_losses.items():
                        self.accelerator.log(
                            {"Epoch/Train {} Loss".format(key): loss},
                            step=self.step,
                        )

                self.step += 1
                epoch_step += 1

        self.accelerator.wait_for_everyone()

        epoch_sum_loss = (
            epoch_sum_loss
            / len(self.train_dataloader)
            * self.cfg.train.gradient_accumulation_step
        )

        for key in epoch_losses.keys():
            epoch_losses[key] = (
                epoch_losses[key]
                / len(self.train_dataloader)
                * self.cfg.train.gradient_accumulation_step
            )

        return epoch_sum_loss, epoch_losses

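    # NOTE: ``epoch_sum_loss`` is accumulated only on optimizer-update steps
    # (once every ``gradient_accumulation_step`` batches), so dividing by
    # ``len(self.train_dataloader)`` and multiplying back by
    # ``gradient_accumulation_step`` yields the average loss per update.
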
    @torch.inference_mode()
    def _valid_epoch(self):
        r"""Validation epoch. Returns the average loss of a batch (sample) over
        one epoch. See ``train_loop`` for usage.
        """
        if isinstance(self.model, dict):
            for key in self.model.keys():
                self.model[key].eval()
        else:
            self.model.eval()

        epoch_sum_loss = 0.0
        epoch_losses = dict()
        for batch in tqdm(
            self.valid_dataloader,
            desc=f"Validating Epoch {self.epoch}",
            unit="batch",
            colour="GREEN",
            leave=False,
            dynamic_ncols=True,
            smoothing=0.04,
            disable=not self.accelerator.is_main_process,
        ):
            total_loss, valid_losses, valid_stats = self._valid_step(batch)
            epoch_sum_loss += total_loss
            if isinstance(valid_losses, dict):
                for key, value in valid_losses.items():
                    if key not in epoch_losses.keys():
                        epoch_losses[key] = value
                    else:
                        epoch_losses[key] += value

        epoch_sum_loss = epoch_sum_loss / len(self.valid_dataloader)
        for key in epoch_losses.keys():
            epoch_losses[key] = epoch_losses[key] / len(self.valid_dataloader)

        self.accelerator.wait_for_everyone()

        return epoch_sum_loss, epoch_losses

    def _train_step(self, batch):
        pass

    def _valid_step(self, batch):
        pass

    def _inference(self):
        pass

    def _is_valid_pattern(self, directory_name):
        directory_name = str(directory_name)
        pattern = r"^epoch-\d{4}_step-\d{7}_loss-\d+\.\d{6}"
        return re.match(pattern, directory_name) is not None

    def __dump_cfg(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            json5.dump(
                self.cfg,
                f,
                indent=4,
                sort_keys=True,
                ensure_ascii=False,
                quote_keys=True,
            )

    def __check_basic_configs(self):
        if self.cfg.train.gradient_accumulation_step <= 0:
            self.logger.fatal("Invalid gradient_accumulation_step value!")
            self.logger.error(
                f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
            )
            self.accelerator.end_training()
            raise ValueError(
                f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
            )
        # TODO: check other values

    @staticmethod
    def __count_parameters(model):
        model_param = 0.0
        if isinstance(model, dict):
            for key, value in model.items():
                model_param += sum(p.numel() for p in value.parameters())
        else:
            model_param = sum(p.numel() for p in model.parameters())
        return model_param

    def _build_speaker_lut(self):
        # Combine speakers from all datasets
        if not os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
            speakers = {}
        else:
            with open(
                os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "r"
            ) as speaker_file:
                speakers = json.load(speaker_file)
        for dataset in self.cfg.dataset:
            speaker_lut_path = os.path.join(
                self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
            )
            with open(speaker_lut_path, "r") as speaker_lut_file:
                speaker_lut = json.load(speaker_lut_file)
            for speaker in speaker_lut.keys():
                if speaker not in speakers:
                    speakers[speaker] = len(speakers)
        with open(
            os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
        ) as speaker_file:
            json.dump(speakers, speaker_file, indent=4, ensure_ascii=False)
        print(
            "Speakers have been dumped to {}".format(
                os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
            )
        )
        return speakers

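    # The merged speaker lookup table is a JSON mapping from speaker name to a
    # contiguous integer id, e.g. (hypothetical names) {"LJSpeech": 0, "p225": 1}.
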
    def _build_utt2spk_dict(self):
        # Combine utterance-to-speaker mappings from all datasets
        utt2spk = {}
        if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk)):
            with open(
                os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk), "r"
            ) as utt2spk_file:
                for line in utt2spk_file.readlines():
                    utt, spk = line.strip().split("\t")
                    utt2spk[utt] = spk
        for dataset in self.cfg.dataset:
            utt2spk_dict_path = os.path.join(
                self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.utt2spk
            )
            with open(utt2spk_dict_path, "r") as utt2spk_dict:
                for line in utt2spk_dict.readlines():
                    utt, spk = line.strip().split("\t")
                    if utt not in utt2spk.keys():
                        utt2spk[utt] = spk
        with open(
            os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk), "w"
        ) as utt2spk_file:
            for utt, spk in utt2spk.items():
                utt2spk_file.write(utt + "\t" + spk + "\n")
        print(
            "The utterance-to-speaker mapping has been dumped to {}".format(
                os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk)
            )
        )
        return utt2spk

    def _save_phone_symbols_file_to_exp_path(self):
        phone_symbols_file = os.path.join(
            self.cfg.preprocess.processed_dir,
            self.cfg.dataset[0],
            self.cfg.preprocess.symbols_dict,
        )
        phone_symbols_file_to_exp_path = os.path.join(
            self.exp_dir, self.cfg.preprocess.symbols_dict
        )
        shutil.copy(phone_symbols_file, phone_symbols_file_to_exp_path)
        print(
            "Phone symbols have been dumped to {}".format(
                os.path.join(self.exp_dir, self.cfg.preprocess.symbols_dict)
            )
        )

models/tts/fastspeech2/__init__.py
ADDED
File without changes