Billpai committed
Commit f196feb · Parent(s): 507c407

test
- egs/svc/MultipleContentsSVC/README.md +153 -0
- egs/svc/MultipleContentsSVC/exp_config.json +126 -0
- egs/svc/MultipleContentsSVC/run.sh +1 -0
- egs/svc/README.md +34 -0
- egs/svc/_template/run.sh +150 -0
- egs/vocoder/README.md +23 -0
- egs/vocoder/diffusion/README.md +0 -0
- egs/vocoder/diffusion/exp_config_base.json +0 -0
- egs/vocoder/gan/README.md +224 -0
- egs/vocoder/gan/_template/run.sh +143 -0
- egs/vocoder/gan/apnet/exp_config.json +45 -0
- egs/vocoder/gan/apnet/run.sh +143 -0
- egs/vocoder/gan/bigvgan/exp_config.json +66 -0
- egs/vocoder/gan/bigvgan/run.sh +143 -0
- egs/vocoder/gan/bigvgan_large/exp_config.json +70 -0
- egs/vocoder/gan/bigvgan_large/run.sh +143 -0
- egs/vocoder/gan/exp_config_base.json +111 -0
- egs/vocoder/gan/hifigan/exp_config.json +59 -0
- egs/vocoder/gan/hifigan/run.sh +143 -0
- egs/vocoder/gan/melgan/exp_config.json +34 -0
- egs/vocoder/gan/melgan/run.sh +143 -0
- egs/vocoder/gan/nsfhifigan/exp_config.json +83 -0
- egs/vocoder/gan/nsfhifigan/run.sh +143 -0
- egs/vocoder/gan/tfr_enhanced_hifigan/README.md +185 -0
- egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json +118 -0
- egs/vocoder/gan/tfr_enhanced_hifigan/run.sh +145 -0
- examples/chinese_female_recordings.wav +3 -0
- examples/chinese_male_seperated.wav +3 -0
- examples/english_female_seperated.wav +3 -0
- examples/english_male_recordings.wav +3 -0
- examples/output/.DS_Store +0 -0
- examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav +3 -0
- examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav +3 -0
- examples/output/english_female_seperated_vocalist_l1_汪峰.wav +3 -0
- examples/output/english_male_recordings_vocalist_l1_石倚洁.wav +3 -0
- models/__init__.py +0 -0
- models/base/__init__.py +7 -0
- models/base/base_dataset.py +350 -0
- models/base/base_inference.py +220 -0
- models/base/base_sampler.py +136 -0
- models/base/base_trainer.py +348 -0
- models/base/new_dataset.py +50 -0
- models/base/new_inference.py +249 -0
- models/base/new_trainer.py +722 -0
- models/svc/__init__.py +0 -0
- models/svc/base/__init__.py +7 -0
- models/svc/base/svc_dataset.py +425 -0
- models/svc/base/svc_inference.py +15 -0
- models/svc/base/svc_trainer.py +111 -0
- models/svc/comosvc/__init__.py +4 -0
egs/svc/MultipleContentsSVC/README.md
ADDED
@@ -0,0 +1,153 @@
# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion

[](https://arxiv.org/abs/2310.11160)
[](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)

<br>
<div align="center">
<img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
</div>
<br>

This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,

- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
- The acoustic model is based on a Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
- The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.

There are four stages in total:

1. Data preparation
2. Features extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
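
For intuition, the snippet below is a minimal, illustrative sketch of what "content features from Whisper" means. It assumes the `openai-whisper` package and a local `example.wav`; it is not Amphion's actual extractor (the real one runs behind `bins/svc/preprocess.py`).

```python
# Illustrative only: take frame-level Whisper encoder outputs as content features.
# Assumes the openai-whisper package and an example.wav file on disk.
import torch
import whisper

model = whisper.load_model("medium")            # matches "whisper_model": "medium"
audio = whisper.load_audio("example.wav")       # loaded as 16 kHz mono
audio = whisper.pad_or_trim(audio)              # Whisper expects 30 s windows
mel = whisper.log_mel_spectrogram(audio).to(model.device)

with torch.no_grad():
    # Encoder output: (1, 1500, 1024) for the medium model, i.e. 1024-dim
    # frame-level features ("whisper_dim": 1024 in exp_config.json).
    content = model.encoder(mel.unsqueeze(0))

print(content.shape)
```
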
### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 1
```

> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
    "train": {
        "batch_size": 32,
        ...
        "adamw": {
            "lr": 2.0e-4
        },
        ...
    }
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model on over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint`. | `Amphion/ckpts/svc/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in `[Your Audios Folder]`, just run:

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
	--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
	--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
	--infer_source_audio_dir [Your Audios Folder] \
	--infer_target_speaker "opencpop_female1" \
	--infer_key_shift "autoshift"
```
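
As background for `--infer_key_shift`: transposing by `n` semitones corresponds to scaling F0 by `2^(n/12)`. The snippet below only illustrates that relationship; the `f0` values are made up, and Amphion's own `autoshift` heuristic may differ.

```python
# Illustration of what a key shift of n semitones means for an F0 contour.
# The f0 array is hypothetical; this is not Amphion's transposition code.
import numpy as np

def shift_f0(f0: np.ndarray, semitones: int) -> np.ndarray:
    """Scale an F0 contour (in Hz) by 2**(semitones / 12)."""
    return f0 * (2.0 ** (semitones / 12.0))

f0 = np.array([220.0, 246.9, 261.6])   # a few example pitches in Hz
print(shift_f0(f0, 3))                  # transpose up 3 semitones
print(shift_f0(f0, -3))                 # transpose down 3 semitones
```
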

## Citations

```bibtex
@article{zhang2023leveraging,
  title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
  author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
  journal={Machine Learning for Audio Workshop, NeurIPS 2023},
  year={2023}
}
```
egs/svc/MultipleContentsSVC/exp_config.json
ADDED
@@ -0,0 +1,126 @@
{
    "base_config": "config/diffusion.json",
    "model_type": "DiffWaveNetSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
        "contentvec_batch_size": 1,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "condition_encoder": {
            // Config for features usage
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "diffusion": {
            "scheduler": "ddpm",
            "scheduler_settings": {
                "num_train_timesteps": 1000,
                "beta_start": 1.0e-4,
                "beta_end": 0.02,
                "beta_schedule": "linear"
            },
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv",
            // bidilconv, unet2d, TODO: unet1d
            "bidilconv": {
                "base_channel": 512,
                "n_res_block": 40,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4,
                // specifically, 1 means no dilation
                "conditioner_size": 384
            }
        }
    },
    "train": {
        "batch_size": 32,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            3,
            50
        ],
        "keep_last": [
            3,
            2
        ],
        "run_eval": [
            true,
            true
        ],
        "adamw": {
            "lr": 2.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 30,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    }
}
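
To make the `scheduler_settings` above concrete: with a `"linear"` schedule, the 1000 noise levels interpolate `beta` from `1.0e-4` to `0.02`. The sketch below only computes what those numbers imply; it is not the trainer's code (Amphion builds the configured `ddpm` scheduler internally).

```python
# What the "ddpm" scheduler_settings above imply, computed by hand (illustrative).
import numpy as np

num_train_timesteps = 1000
beta_start, beta_end = 1.0e-4, 0.02

betas = np.linspace(beta_start, beta_end, num_train_timesteps)  # "linear" schedule
alphas_cumprod = np.cumprod(1.0 - betas)

# In DDPM, x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise,
# so the signal term shrinks toward zero as t approaches 1000.
for t in (0, 499, 999):
    print(f"t={t:4d}  beta={betas[t]:.5f}  sqrt(alpha_bar)={np.sqrt(alphas_cumprod[t]):.4f}")
```
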
egs/svc/MultipleContentsSVC/run.sh
ADDED
@@ -0,0 +1 @@
../_template/run.sh
egs/svc/README.md
ADDED
@@ -0,0 +1,34 @@
# Amphion Singing Voice Conversion (SVC) Recipe

## Quick Start

We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).

## Supported Model Architectures

The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):

<br>
<div align="center">
<img src="../../imgs/svc/pipeline.png" width="70%">
</div>
<br>

Until now, Amphion SVC has supported the following features and models (a high-level sketch of the pipeline follows this list):

- **Speaker-agnostic Representations**:
  - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
  - Prosody Features: F0 and energy.
- **Speaker Embeddings**:
  - Speaker Look-Up Table.
  - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
- **Acoustic Decoders**:
  - Diffusion-based models:
    - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on a Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
    - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on the [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
  - Transformer-based models:
    - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
  - VAE- and Flow-based models:
    - **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **Waveform Synthesizers (Vocoders)**:
  - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
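
The sketch below summarizes how those pieces compose. Every function in it is a placeholder stub for illustration, not Amphion's API.

```python
# High-level, runnable sketch of the SVC pipeline described above.
# All functions are placeholder stubs; they only show how the stages compose.
import numpy as np

def extract_content_features(wav):   # stands in for Whisper / ContentVec / WeNet
    return np.zeros((100, 1024))

def extract_prosody_features(wav):   # stands in for frame-level F0 and energy
    return np.zeros((100, 2))

def acoustic_decoder(content, prosody, speaker):  # e.g. DiffWaveNetSVC
    return np.zeros((100, 100))      # frame-level mel spectrogram

def vocoder(mel):                    # e.g. BigVGAN
    return np.zeros(24000)           # waveform samples

speaker_lookup_table = {"opencpop_female1": np.zeros(256)}

def convert(source_wav, target_speaker):
    content = extract_content_features(source_wav)    # 1. speaker-agnostic content
    prosody = extract_prosody_features(source_wav)    # 1. prosody (F0, energy)
    speaker = speaker_lookup_table[target_speaker]    # 2. desired speaker identity
    mel = acoustic_decoder(content, prosody, speaker) # 3. acoustic decoder
    return vocoder(mel)                               # 4. waveform synthesizer

print(convert(np.zeros(24000), "opencpop_female1").shape)
```
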
egs/svc/_template/run.sh
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
    # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
    --infer_source_file) shift; infer_source_file=$1 ; shift ;;
    --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
    # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be "opencpop_female1".
    --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
    # [Only for Inference] For advanced users, you can set the trans_key parameter to an integer (the number of semitones you want to transpose). Its default value is "autoshift".
    --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
    # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
    --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1" ; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
        --config $exp_config \
        --num_workers 4
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume_from_ckpt_path "$resume_from_ckpt_path" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
        echo "[Error] Please specify the source file/dir. The inference source can be a json file or a dir. For example, the source_file can be [Your path to save processed data]/[YourDataset]/test.json, and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
        exit 1
    fi

    if [ -z "$infer_source_file" ]; then
        infer_source=$infer_source_audio_dir
    fi

    if [ -z "$infer_source_audio_dir" ]; then
        infer_source=$infer_source_file
    fi

    if [ -z "$infer_target_speaker" ]; then
        echo "[Error] Please specify the target speaker. You can refer to [Your path to save logs and checkpoints]/[Your Expt Name]/singers.json. In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be opencpop_female1."
        exit 1
    fi

    if [ -z "$infer_key_shift" ]; then
        infer_key_shift="autoshift"
    fi

    if [ -z "$infer_vocoder_dir" ]; then
        infer_vocoder_dir="$work_dir"/pretrained/bigvgan
        echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
    fi

    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
        --config $exp_config \
        --acoustics_dir $infer_expt_dir \
        --vocoder_dir $infer_vocoder_dir \
        --target_singer $infer_target_speaker \
        --trans_key $infer_key_shift \
        --source $infer_source \
        --output_dir $infer_output_dir \
        --log_level debug
fi
egs/vocoder/README.md
ADDED
@@ -0,0 +1,23 @@
# Amphion Vocoder Recipe

## Quick Start

We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high-quality HiFi-GAN speech vocoder. Specifically, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/).

## Supported Models

A neural vocoder generates audible waveforms from acoustic representations, and is one of the key parts of current audio generation systems. Until now, Amphion has supported various widely used vocoders of different types, including:

- **GAN-based vocoders**, for which we provide [**a unified recipe**](gan/README.md):
  - [MelGAN](https://arxiv.org/abs/1910.06711)
  - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
  - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
  - [BigVGAN](https://arxiv.org/abs/2206.04658)
  - [APNet](https://arxiv.org/abs/2305.07952)
- **Flow-based vocoders** (👨‍💻 developing):
  - [WaveGlow](https://arxiv.org/abs/1811.00002)
- **Diffusion-based vocoders** (👨‍💻 developing):
  - [DiffWave](https://arxiv.org/abs/2009.09761)
- **Auto-regressive based vocoders** (👨‍💻 developing):
  - [WaveNet](https://arxiv.org/abs/1609.03499)
  - [WaveRNN](https://arxiv.org/abs/1802.08435v1)
egs/vocoder/diffusion/README.md
ADDED
File without changes
egs/vocoder/diffusion/exp_config_base.json
ADDED
File without changes
egs/vocoder/gan/README.md
ADDED
@@ -0,0 +1,224 @@
# Amphion GAN-based Vocoder Recipe

## Supported Model Architectures

A GAN-based vocoder consists of a generator and multiple discriminators, as illustrated below:

<br>
<div align="center">
<img src="../../../imgs/vocoder/gan/pipeline.png" width="40%">
</div>
<br>

Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators.

- **Generators**
  - [MelGAN](https://arxiv.org/abs/1910.06711)
  - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
  - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
  - [BigVGAN](https://arxiv.org/abs/2206.04658)
  - [APNet](https://arxiv.org/abs/2305.07952)
- **Discriminators**
  - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646)
  - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646)
  - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631)
  - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438)
  - [**Multi-Scale Constant-Q Transform Discriminator (ours)**](https://arxiv.org/abs/2311.14957)

You can use any vocoder architecture with any dataset you want. There are four steps in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

You can train the vocoder with any dataset. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md).

### Configuration

Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "csd",
        "kising",
        "m4singer",
        "nus48e",
        "opencpop",
        "opensinger",
        "opera",
        "pjs",
        "popbutfy",
        "popcs",
        "ljspeech",
        "vctk",
        "libritts",
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "csd": "[dataset path]",
        "kising": "[dataset path]",
        "m4singer": "[dataset path]",
        "nus48e": "[dataset path]",
        "opencpop": "[dataset path]",
        "opensinger": "[dataset path]",
        "opera": "[dataset path]",
        "pjs": "[dataset path]",
        "popbutfy": "[dataset path]",
        "popcs": "[dataset path]",
        "ljspeech": "[dataset path]",
        "vctk": "[dataset path]",
        "libritts": "[dataset path]",
    },
```

## 2. Feature Extraction

The needed features are specified in the individual vocoder directory, so no modification is required here.

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
    "log_dir": "ckpts/vocoder",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1
```

> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config_base.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
    "train": {
        "batch_size": 16,
        "max_epoch": 1000000,
        "save_checkpoint_stride": [20],
        "adamw": {
            "lr": 2.0e-4,
            "adam_b1": 0.8,
            "adam_b2": 0.99
        },
        "exponential_lr": {
            "lr_decay": 0.999
        },
    }
```

You can also choose any number of preferred discriminators for training in `exp_config_base.json`.

```json
    "discriminators": [
        "msd",
        "mpd",
        "msstftd",
        "mssbcqtd",
    ],
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.


## 4. Inference

### Run

Run `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
	--infer_mode [Your chosen inference mode] \
	--infer_datasets [Datasets you want to infer, needed when infer_from_dataset] \
	--infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
	--infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
	--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
	--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
```

#### a. Inference from Dataset

Run `run.sh` with the specified datasets; here is an example.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
	--infer_mode infer_from_dataset \
	--infer_datasets "libritts vctk ljspeech" \
	--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
	--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
```

#### b. Inference from Features

If you want to run inference from your generated acoustic features, you should first prepare your acoustic features in the following structure:

```plaintext
 ┣ {infer_feature_dir}
 ┃ ┣ mels
 ┃ ┃ ┣ sample1.npy
 ┃ ┃ ┣ sample2.npy
 ┃ ┣ f0s (required if you use NSF-HiFiGAN)
 ┃ ┃ ┣ sample1.npy
 ┃ ┃ ┣ sample2.npy
```
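
As a rough illustration, predicted features could be dumped into that layout like this. The directory name and the random arrays are placeholders; the array shapes must match what your acoustic model and the preprocessing configuration actually produce.

```python
# Illustrative helper for laying out {infer_feature_dir} as shown above.
# Paths and arrays are placeholders; save your model's real mel / F0 outputs.
import os
import numpy as np

infer_feature_dir = "predicted_features"
os.makedirs(os.path.join(infer_feature_dir, "mels"), exist_ok=True)
os.makedirs(os.path.join(infer_feature_dir, "f0s"), exist_ok=True)  # only needed for NSF-HiFiGAN

for name in ("sample1", "sample2"):
    mel = np.random.randn(100, 250).astype(np.float32)   # placeholder mel (n_mel, frames)
    f0 = (np.random.rand(250).astype(np.float32) * 300)  # placeholder frame-level F0 in Hz
    np.save(os.path.join(infer_feature_dir, "mels", f"{name}.npy"), mel)
    np.save(os.path.join(infer_feature_dir, "f0s", f"{name}.npy"), f0)
```
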

Then run `run.sh` with the specified folder; here is an example.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
	--infer_mode infer_from_feature \
	--infer_feature_dir [Your path to your predicted acoustic features] \
	--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
	--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
```

#### c. Inference from Audios

If you want to run inference from audios for quick analysis-synthesis, you should first prepare your audios in the following structure:

```plaintext
 ┣ audios
 ┃ ┣ sample1.wav
 ┃ ┣ sample2.wav
```

Then run `run.sh` with the specified folder; here is an example.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
	--infer_mode infer_from_audio \
	--infer_audio_dir [Your path to your audio files] \
	--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
	--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
```
egs/vocoder/gan/_template/run.sh
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --checkpoint) shift; checkpoint=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The inference mode
    --infer_mode) shift; infer_mode=$1 ; shift ;;
    # [Only for Inference] The datasets to run inference on
    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
    # [Only for Inference] The feature dir for inference
    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
    # [Only for Inference] The audio dir for inference
    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1" ; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
        --config $exp_config \
        --num_workers 8
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --checkpoint "$checkpoint" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ $infer_mode = "infer_from_dataset" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --infer_datasets $infer_datasets \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_feature" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --feature_folder $infer_feature_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_audio" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --audio_folder $infer_audio_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

fi
egs/vocoder/gan/apnet/exp_config.json
ADDED
@@ -0,0 +1,45 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // acoustic features
        "extract_mel": true,
        "extract_audio": true,
        "extract_amplitude_phase": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true,
        "use_amplitude_phase": true
    },
    "model": {
        "generator": "apnet",
        "apnet": {
            "ASP_channel": 512,
            "ASP_resblock_kernel_sizes": [3,7,11],
            "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
            "ASP_input_conv_kernel_size": 7,
            "ASP_output_conv_kernel_size": 7,

            "PSP_channel": 512,
            "PSP_resblock_kernel_sizes": [3,7,11],
            "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
            "PSP_input_conv_kernel_size": 7,
            "PSP_output_R_conv_kernel_size": 7,
            "PSP_output_I_conv_kernel_size": 7,
        }
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel",
            "phase",
            "amplitude",
            "consistency"
        ]
    },
    "inference": {
        "batch_size": 1,
    }
}
egs/vocoder/gan/apnet/run.sh
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --checkpoint) shift; checkpoint=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The inference mode
    --infer_mode) shift; infer_mode=$1 ; shift ;;
    # [Only for Inference] The datasets to run inference on
    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
    # [Only for Inference] The feature dir for inference
    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
    # [Only for Inference] The audio dir for inference
    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1" ; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
        --config $exp_config \
        --num_workers 8
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --checkpoint "$checkpoint" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ $infer_mode = "infer_from_dataset" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --infer_datasets $infer_datasets \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_feature" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --feature_folder $infer_feature_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_audio" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --audio_folder $infer_audio_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

fi
egs/vocoder/gan/bigvgan/exp_config.json
ADDED
@@ -0,0 +1,66 @@
| 1 |
+
{
|
| 2 |
+
"base_config": "egs/vocoder/gan/exp_config_base.json",
|
| 3 |
+
"preprocess": {
|
| 4 |
+
// acoustic features
|
| 5 |
+
"extract_mel": true,
|
| 6 |
+
"extract_audio": true,
|
| 7 |
+
|
| 8 |
+
// Features used for model training
|
| 9 |
+
"use_mel": true,
|
| 10 |
+
"use_audio": true
|
| 11 |
+
},
|
| 12 |
+
"model": {
|
| 13 |
+
"generator": "bigvgan",
|
| 14 |
+
"bigvgan": {
|
| 15 |
+
"resblock": "1",
|
| 16 |
+
"activation": "snakebeta",
|
| 17 |
+
"snake_logscale": true,
|
| 18 |
+
"upsample_rates": [
|
| 19 |
+
8,
|
| 20 |
+
8,
|
| 21 |
+
2,
|
| 22 |
+
2,
|
| 23 |
+
],
|
| 24 |
+
"upsample_kernel_sizes": [
|
| 25 |
+
16,
|
| 26 |
+
16,
|
| 27 |
+
4,
|
| 28 |
+
4
|
| 29 |
+
],
|
| 30 |
+
"upsample_initial_channel": 512,
|
| 31 |
+
"resblock_kernel_sizes": [
|
| 32 |
+
3,
|
| 33 |
+
7,
|
| 34 |
+
11
|
| 35 |
+
],
|
| 36 |
+
"resblock_dilation_sizes": [
|
| 37 |
+
[
|
| 38 |
+
1,
|
| 39 |
+
3,
|
| 40 |
+
5
|
| 41 |
+
],
|
| 42 |
+
[
|
| 43 |
+
1,
|
| 44 |
+
3,
|
| 45 |
+
5
|
| 46 |
+
],
|
| 47 |
+
[
|
| 48 |
+
1,
|
| 49 |
+
3,
|
| 50 |
+
5
|
| 51 |
+
]
|
| 52 |
+
]
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"train": {
|
| 56 |
+
"criterions": [
|
| 57 |
+
"feature",
|
| 58 |
+
"discriminator",
|
| 59 |
+
"generator",
|
| 60 |
+
"mel",
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
"inference": {
|
| 64 |
+
"batch_size": 1,
|
| 65 |
+
}
|
| 66 |
+
}
|
egs/vocoder/gan/bigvgan/run.sh
ADDED
|
@@ -0,0 +1,143 @@
|
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
######## Build Experiment Environment ###########
|
| 7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
| 8 |
+
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
|
| 9 |
+
|
| 10 |
+
export WORK_DIR=$work_dir
|
| 11 |
+
export PYTHONPATH=$work_dir
|
| 12 |
+
export PYTHONIOENCODING=UTF-8
|
| 13 |
+
|
| 14 |
+
######## Parse the Given Parameters from the Command ###########
|
| 15 |
+
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
|
| 16 |
+
eval set -- "$options"
|
| 17 |
+
|
| 18 |
+
while true; do
|
| 19 |
+
case $1 in
|
| 20 |
+
# Experimental Configuration File
|
| 21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
| 22 |
+
# Experimental Name
|
| 23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
| 24 |
+
# Running Stage
|
| 25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
| 26 |
+
# Visible GPU machines. The default value is "0".
|
| 27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
| 28 |
+
|
| 29 |
+
# [Only for Training] Resume configuration
|
| 30 |
+
--resume) shift; resume=$1 ; shift ;;
|
| 31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
| 32 |
+
--checkpoint) shift; checkpoint=$1 ; shift ;;
|
| 33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
| 34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
| 35 |
+
|
| 36 |
+
# [Only for Inference] The inference mode
|
| 37 |
+
--infer_mode) shift; infer_mode=$1 ; shift ;;
|
| 38 |
+
# [Only for Inference] The inferenced datasets
|
| 39 |
+
--infer_datasets) shift; infer_datasets=$1 ; shift ;;
|
| 40 |
+
# [Only for Inference] The feature dir for inference
|
| 41 |
+
--infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
|
| 42 |
+
# [Only for Inference] The audio dir for inference
|
| 43 |
+
--infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
|
| 44 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
| 45 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
| 46 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
| 47 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
| 48 |
+
|
| 49 |
+
--) shift ; break ;;
|
| 50 |
+
*) echo "Invalid option: $1" exit 1 ;;
|
| 51 |
+
esac
|
| 52 |
+
done
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
### Value check ###
|
| 56 |
+
if [ -z "$running_stage" ]; then
|
| 57 |
+
echo "[Error] Please specify the running stage"
|
| 58 |
+
exit 1
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
if [ -z "$exp_config" ]; then
|
| 62 |
+
exp_config="${exp_dir}"/exp_config.json
|
| 63 |
+
fi
|
| 64 |
+
echo "Exprimental Configuration File: $exp_config"
|
| 65 |
+
|
| 66 |
+
if [ -z "$gpu" ]; then
|
| 67 |
+
gpu="0"
|
| 68 |
+
fi
|
| 69 |
+
|
| 70 |
+
######## Features Extraction ###########
|
| 71 |
+
if [ $running_stage -eq 1 ]; then
|
| 72 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
|
| 73 |
+
--config $exp_config \
|
| 74 |
+
--num_workers 8
|
| 75 |
+
fi
|
| 76 |
+
|
| 77 |
+
######## Training ###########
|
| 78 |
+
if [ $running_stage -eq 2 ]; then
|
| 79 |
+
if [ -z "$exp_name" ]; then
|
| 80 |
+
echo "[Error] Please specify the experiments name"
|
| 81 |
+
exit 1
|
| 82 |
+
fi
|
| 83 |
+
echo "Exprimental Name: $exp_name"
|
| 84 |
+
|
| 85 |
+
if [ "$resume" = true ]; then
|
| 86 |
+
echo "Automatically resume from the experimental dir..."
|
| 87 |
+
CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 88 |
+
--config "$exp_config" \
|
| 89 |
+
--exp_name "$exp_name" \
|
| 90 |
+
--log_level info \
|
| 91 |
+
--resume
|
| 92 |
+
else
|
| 93 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 94 |
+
--config "$exp_config" \
|
| 95 |
+
--exp_name "$exp_name" \
|
| 96 |
+
--log_level info \
|
| 97 |
+
--checkpoint "$checkpoint" \
|
| 98 |
+
--resume_type "$resume_type"
|
| 99 |
+
fi
|
| 100 |
+
fi
|
| 101 |
+
|
| 102 |
+
######## Inference/Conversion ###########
|
| 103 |
+
if [ $running_stage -eq 3 ]; then
|
| 104 |
+
if [ -z "$infer_expt_dir" ]; then
|
| 105 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
| 106 |
+
exit 1
|
| 107 |
+
fi
|
| 108 |
+
|
| 109 |
+
if [ -z "$infer_output_dir" ]; then
|
| 110 |
+
infer_output_dir="$infer_expt_dir/result"
|
| 111 |
+
fi
|
| 112 |
+
|
| 113 |
+
if [ $infer_mode = "infer_from_dataset" ]; then
|
| 114 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 115 |
+
--config $exp_config \
|
| 116 |
+
--infer_mode $infer_mode \
|
| 117 |
+
--infer_datasets $infer_datasets \
|
| 118 |
+
--vocoder_dir $infer_expt_dir \
|
| 119 |
+
--output_dir $infer_output_dir \
|
| 120 |
+
--log_level debug
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
if [ $infer_mode = "infer_from_feature" ]; then
|
| 124 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 125 |
+
--config $exp_config \
|
| 126 |
+
--infer_mode $infer_mode \
|
| 127 |
+
--feature_folder $infer_feature_dir \
|
| 128 |
+
--vocoder_dir $infer_expt_dir \
|
| 129 |
+
--output_dir $infer_output_dir \
|
| 130 |
+
--log_level debug
|
| 131 |
+
fi
|
| 132 |
+
|
| 133 |
+
if [ $infer_mode = "infer_from_audio" ]; then
|
| 134 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 135 |
+
--config $exp_config \
|
| 136 |
+
--infer_mode $infer_mode \
|
| 137 |
+
--audio_folder $infer_audio_dir \
|
| 138 |
+
--vocoder_dir $infer_expt_dir \
|
| 139 |
+
--output_dir $infer_output_dir \
|
| 140 |
+
--log_level debug
|
| 141 |
+
fi
|
| 142 |
+
|
| 143 |
+
fi
|
egs/vocoder/gan/bigvgan_large/exp_config.json
ADDED
|
@@ -0,0 +1,70 @@
|
| 1 |
+
{
|
| 2 |
+
"base_config": "egs/vocoder/gan/exp_config_base.json",
|
| 3 |
+
"preprocess": {
|
| 4 |
+
// acoustic features
|
| 5 |
+
"extract_mel": true,
|
| 6 |
+
"extract_audio": true,
|
| 7 |
+
|
| 8 |
+
// Features used for model training
|
| 9 |
+
"use_mel": true,
|
| 10 |
+
"use_audio": true
|
| 11 |
+
},
|
| 12 |
+
"model": {
|
| 13 |
+
"generator": "bigvgan",
|
| 14 |
+
"bigvgan": {
|
| 15 |
+
"resblock": "1",
|
| 16 |
+
"activation": "snakebeta",
|
| 17 |
+
"snake_logscale": true,
|
| 18 |
+
"upsample_rates": [
|
| 19 |
+
4,
|
| 20 |
+
4,
|
| 21 |
+
2,
|
| 22 |
+
2,
|
| 23 |
+
2,
|
| 24 |
+
2
|
| 25 |
+
],
|
| 26 |
+
"upsample_kernel_sizes": [
|
| 27 |
+
8,
|
| 28 |
+
8,
|
| 29 |
+
4,
|
| 30 |
+
4,
|
| 31 |
+
4,
|
| 32 |
+
4
|
| 33 |
+
],
|
| 34 |
+
"upsample_initial_channel": 1536,
|
| 35 |
+
"resblock_kernel_sizes": [
|
| 36 |
+
3,
|
| 37 |
+
7,
|
| 38 |
+
11
|
| 39 |
+
],
|
| 40 |
+
"resblock_dilation_sizes": [
|
| 41 |
+
[
|
| 42 |
+
1,
|
| 43 |
+
3,
|
| 44 |
+
5
|
| 45 |
+
],
|
| 46 |
+
[
|
| 47 |
+
1,
|
| 48 |
+
3,
|
| 49 |
+
5
|
| 50 |
+
],
|
| 51 |
+
[
|
| 52 |
+
1,
|
| 53 |
+
3,
|
| 54 |
+
5
|
| 55 |
+
]
|
| 56 |
+
]
|
| 57 |
+
},
|
| 58 |
+
},
|
| 59 |
+
"train": {
|
| 60 |
+
"criterions": [
|
| 61 |
+
"feature",
|
| 62 |
+
"discriminator",
|
| 63 |
+
"generator",
|
| 64 |
+
"mel",
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
"inference": {
|
| 68 |
+
"batch_size": 1,
|
| 69 |
+
}
|
| 70 |
+
}
|
egs/vocoder/gan/bigvgan_large/run.sh
ADDED
|
@@ -0,0 +1,143 @@
|
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
######## Build Experiment Environment ###########
|
| 7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
| 8 |
+
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
|
| 9 |
+
|
| 10 |
+
export WORK_DIR=$work_dir
|
| 11 |
+
export PYTHONPATH=$work_dir
|
| 12 |
+
export PYTHONIOENCODING=UTF-8
|
| 13 |
+
|
| 14 |
+
######## Parse the Given Parameters from the Command ###########
|
| 15 |
+
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
|
| 16 |
+
eval set -- "$options"
|
| 17 |
+
|
| 18 |
+
while true; do
|
| 19 |
+
case $1 in
|
| 20 |
+
# Experimental Configuration File
|
| 21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
| 22 |
+
# Experimental Name
|
| 23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
| 24 |
+
# Running Stage
|
| 25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
| 26 |
+
# Visible GPU machines. The default value is "0".
|
| 27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
| 28 |
+
|
| 29 |
+
# [Only for Training] Resume configuration
|
| 30 |
+
--resume) shift; resume=$1 ; shift ;;
|
| 31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
| 32 |
+
--checkpoint) shift; checkpoint=$1 ; shift ;;
|
| 33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
| 34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
| 35 |
+
|
| 36 |
+
# [Only for Inference] The inference mode
|
| 37 |
+
--infer_mode) shift; infer_mode=$1 ; shift ;;
|
| 38 |
+
# [Only for Inference] The inferenced datasets
|
| 39 |
+
--infer_datasets) shift; infer_datasets=$1 ; shift ;;
|
| 40 |
+
# [Only for Inference] The feature dir for inference
|
| 41 |
+
--infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
|
| 42 |
+
# [Only for Inference] The audio dir for inference
|
| 43 |
+
--infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
|
| 44 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
| 45 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
| 46 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
| 47 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
| 48 |
+
|
| 49 |
+
--) shift ; break ;;
|
| 50 |
+
*) echo "Invalid option: $1" exit 1 ;;
|
| 51 |
+
esac
|
| 52 |
+
done
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
### Value check ###
|
| 56 |
+
if [ -z "$running_stage" ]; then
|
| 57 |
+
echo "[Error] Please specify the running stage"
|
| 58 |
+
exit 1
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
if [ -z "$exp_config" ]; then
|
| 62 |
+
exp_config="${exp_dir}"/exp_config.json
|
| 63 |
+
fi
|
| 64 |
+
echo "Exprimental Configuration File: $exp_config"
|
| 65 |
+
|
| 66 |
+
if [ -z "$gpu" ]; then
|
| 67 |
+
gpu="0"
|
| 68 |
+
fi
|
| 69 |
+
|
| 70 |
+
######## Features Extraction ###########
|
| 71 |
+
if [ $running_stage -eq 1 ]; then
|
| 72 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
|
| 73 |
+
--config $exp_config \
|
| 74 |
+
--num_workers 8
|
| 75 |
+
fi
|
| 76 |
+
|
| 77 |
+
######## Training ###########
|
| 78 |
+
if [ $running_stage -eq 2 ]; then
|
| 79 |
+
if [ -z "$exp_name" ]; then
|
| 80 |
+
echo "[Error] Please specify the experiments name"
|
| 81 |
+
exit 1
|
| 82 |
+
fi
|
| 83 |
+
echo "Exprimental Name: $exp_name"
|
| 84 |
+
|
| 85 |
+
if [ "$resume" = true ]; then
|
| 86 |
+
echo "Automatically resume from the experimental dir..."
|
| 87 |
+
CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 88 |
+
--config "$exp_config" \
|
| 89 |
+
--exp_name "$exp_name" \
|
| 90 |
+
--log_level info \
|
| 91 |
+
--resume
|
| 92 |
+
else
|
| 93 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 94 |
+
--config "$exp_config" \
|
| 95 |
+
--exp_name "$exp_name" \
|
| 96 |
+
--log_level info \
|
| 97 |
+
--checkpoint "$checkpoint" \
|
| 98 |
+
--resume_type "$resume_type"
|
| 99 |
+
fi
|
| 100 |
+
fi
|
| 101 |
+
|
| 102 |
+
######## Inference/Conversion ###########
|
| 103 |
+
if [ $running_stage -eq 3 ]; then
|
| 104 |
+
if [ -z "$infer_expt_dir" ]; then
|
| 105 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
| 106 |
+
exit 1
|
| 107 |
+
fi
|
| 108 |
+
|
| 109 |
+
if [ -z "$infer_output_dir" ]; then
|
| 110 |
+
infer_output_dir="$infer_expt_dir/result"
|
| 111 |
+
fi
|
| 112 |
+
|
| 113 |
+
if [ $infer_mode = "infer_from_dataset" ]; then
|
| 114 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 115 |
+
--config $exp_config \
|
| 116 |
+
--infer_mode $infer_mode \
|
| 117 |
+
--infer_datasets $infer_datasets \
|
| 118 |
+
--vocoder_dir $infer_expt_dir \
|
| 119 |
+
--output_dir $infer_output_dir \
|
| 120 |
+
--log_level debug
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
if [ $infer_mode = "infer_from_feature" ]; then
|
| 124 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 125 |
+
--config $exp_config \
|
| 126 |
+
--infer_mode $infer_mode \
|
| 127 |
+
--feature_folder $infer_feature_dir \
|
| 128 |
+
--vocoder_dir $infer_expt_dir \
|
| 129 |
+
--output_dir $infer_output_dir \
|
| 130 |
+
--log_level debug
|
| 131 |
+
fi
|
| 132 |
+
|
| 133 |
+
if [ $infer_mode = "infer_from_audio" ]; then
|
| 134 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 135 |
+
--config $exp_config \
|
| 136 |
+
--infer_mode $infer_mode \
|
| 137 |
+
--audio_folder $infer_audio_dir \
|
| 138 |
+
--vocoder_dir $infer_expt_dir \
|
| 139 |
+
--output_dir $infer_output_dir \
|
| 140 |
+
--log_level debug
|
| 141 |
+
fi
|
| 142 |
+
|
| 143 |
+
fi
|
egs/vocoder/gan/exp_config_base.json
ADDED
|
@@ -0,0 +1,111 @@
|
| 1 |
+
{
|
| 2 |
+
"base_config": "config/vocoder.json",
|
| 3 |
+
"model_type": "GANVocoder",
|
| 4 |
+
// TODO: Choose your needed datasets
|
| 5 |
+
"dataset": [
|
| 6 |
+
"csd",
|
| 7 |
+
"kising",
|
| 8 |
+
"m4singer",
|
| 9 |
+
"nus48e",
|
| 10 |
+
"opencpop",
|
| 11 |
+
"opensinger",
|
| 12 |
+
"opera",
|
| 13 |
+
"pjs",
|
| 14 |
+
"popbutfy",
|
| 15 |
+
"popcs",
|
| 16 |
+
"ljspeech",
|
| 17 |
+
"vctk",
|
| 18 |
+
"libritts",
|
| 19 |
+
],
|
| 20 |
+
"dataset_path": {
|
| 21 |
+
// TODO: Fill in your dataset path
|
| 22 |
+
"csd": "[dataset path]",
|
| 23 |
+
"kising": "[dataset path]",
|
| 24 |
+
"m4singer": "[dataset path]",
|
| 25 |
+
"nus48e": "[dataset path]",
|
| 26 |
+
"opencpop": "[dataset path]",
|
| 27 |
+
"opensinger": "[dataset path]",
|
| 28 |
+
"opera": "[dataset path]",
|
| 29 |
+
"pjs": "[dataset path]",
|
| 30 |
+
"popbutfy": "[dataset path]",
|
| 31 |
+
"popcs": "[dataset path]",
|
| 32 |
+
"ljspeech": "[dataset path]",
|
| 33 |
+
"vctk": "[dataset path]",
|
| 34 |
+
"libritts": "[dataset path]",
|
| 35 |
+
},
|
| 36 |
+
// TODO: Fill in the output log path
|
| 37 |
+
"log_dir": "ckpts/vocoder",
|
| 38 |
+
"preprocess": {
|
| 39 |
+
// Acoustic features
|
| 40 |
+
"extract_mel": true,
|
| 41 |
+
"extract_audio": true,
|
| 42 |
+
"extract_pitch": false,
|
| 43 |
+
"extract_uv": false,
|
| 44 |
+
"pitch_extractor": "parselmouth",
|
| 45 |
+
|
| 46 |
+
// Features used for model training
|
| 47 |
+
"use_mel": true,
|
| 48 |
+
"use_frame_pitch": false,
|
| 49 |
+
"use_uv": false,
|
| 50 |
+
"use_audio": true,
|
| 51 |
+
|
| 52 |
+
// TODO: Fill in the output data path
|
| 53 |
+
"processed_dir": "data/",
|
| 54 |
+
"n_mel": 100,
|
| 55 |
+
"sample_rate": 24000
|
| 56 |
+
},
|
| 57 |
+
"model": {
|
| 58 |
+
// TODO: Choose your needed discriminators
|
| 59 |
+
"discriminators": [
|
| 60 |
+
"msd",
|
| 61 |
+
"mpd",
|
| 62 |
+
"msstftd",
|
| 63 |
+
"mssbcqtd",
|
| 64 |
+
],
|
| 65 |
+
"mpd": {
|
| 66 |
+
"mpd_reshapes": [
|
| 67 |
+
2,
|
| 68 |
+
3,
|
| 69 |
+
5,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"use_spectral_norm": false,
|
| 74 |
+
"discriminator_channel_mult_factor": 1
|
| 75 |
+
},
|
| 76 |
+
"mrd": {
|
| 77 |
+
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
|
| 78 |
+
"use_spectral_norm": false,
|
| 79 |
+
"discriminator_channel_mult_factor": 1,
|
| 80 |
+
"mrd_override": false
|
| 81 |
+
},
|
| 82 |
+
"msstftd": {
|
| 83 |
+
"filters": 32
|
| 84 |
+
},
|
| 85 |
+
"mssbcqtd": {
|
| 86 |
+
hop_lengths: [512, 256, 256],
|
| 87 |
+
filters: 32,
|
| 88 |
+
max_filters: 1024,
|
| 89 |
+
filters_scale: 1,
|
| 90 |
+
dilations: [1, 2, 4],
|
| 91 |
+
in_channels: 1,
|
| 92 |
+
out_channels: 1,
|
| 93 |
+
n_octaves: [9, 9, 9],
|
| 94 |
+
bins_per_octaves: [24, 36, 48]
|
| 95 |
+
},
|
| 96 |
+
},
|
| 97 |
+
"train": {
|
| 98 |
+
// TODO: Choose a suitable batch size, training epoch, and save stride
|
| 99 |
+
"batch_size": 32,
|
| 100 |
+
"max_epoch": 1000000,
|
| 101 |
+
"save_checkpoint_stride": [20],
|
| 102 |
+
"adamw": {
|
| 103 |
+
"lr": 2.0e-4,
|
| 104 |
+
"adam_b1": 0.8,
|
| 105 |
+
"adam_b2": 0.99
|
| 106 |
+
},
|
| 107 |
+
"exponential_lr": {
|
| 108 |
+
"lr_decay": 0.999
|
| 109 |
+
},
|
| 110 |
+
}
|
| 111 |
+
}
|
egs/vocoder/gan/hifigan/exp_config.json
ADDED
|
@@ -0,0 +1,59 @@
|
| 1 |
+
{
|
| 2 |
+
"base_config": "egs/vocoder/gan/exp_config_base.json",
|
| 3 |
+
"preprocess": {
|
| 4 |
+
// acoustic features
|
| 5 |
+
"extract_mel": true,
|
| 6 |
+
"extract_audio": true,
|
| 7 |
+
|
| 8 |
+
// Features used for model training
|
| 9 |
+
"use_mel": true,
|
| 10 |
+
"use_audio": true
|
| 11 |
+
},
|
| 12 |
+
"model": {
|
| 13 |
+
"generator": "hifigan",
|
| 14 |
+
"hifigan": {
|
| 15 |
+
"resblock": "2",
|
| 16 |
+
"upsample_rates": [
|
| 17 |
+
8,
|
| 18 |
+
8,
|
| 19 |
+
4
|
| 20 |
+
],
|
| 21 |
+
"upsample_kernel_sizes": [
|
| 22 |
+
16,
|
| 23 |
+
16,
|
| 24 |
+
8
|
| 25 |
+
],
|
| 26 |
+
"upsample_initial_channel": 256,
|
| 27 |
+
"resblock_kernel_sizes": [
|
| 28 |
+
3,
|
| 29 |
+
5,
|
| 30 |
+
7
|
| 31 |
+
],
|
| 32 |
+
"resblock_dilation_sizes": [
|
| 33 |
+
[
|
| 34 |
+
1,
|
| 35 |
+
2
|
| 36 |
+
],
|
| 37 |
+
[
|
| 38 |
+
2,
|
| 39 |
+
6
|
| 40 |
+
],
|
| 41 |
+
[
|
| 42 |
+
3,
|
| 43 |
+
12
|
| 44 |
+
]
|
| 45 |
+
]
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"train": {
|
| 49 |
+
"criterions": [
|
| 50 |
+
"feature",
|
| 51 |
+
"discriminator",
|
| 52 |
+
"generator",
|
| 53 |
+
"mel",
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
"inference": {
|
| 57 |
+
"batch_size": 1,
|
| 58 |
+
}
|
| 59 |
+
}
|
egs/vocoder/gan/hifigan/run.sh
ADDED
|
@@ -0,0 +1,143 @@
|
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
######## Build Experiment Environment ###########
|
| 7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
| 8 |
+
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
|
| 9 |
+
|
| 10 |
+
export WORK_DIR=$work_dir
|
| 11 |
+
export PYTHONPATH=$work_dir
|
| 12 |
+
export PYTHONIOENCODING=UTF-8
|
| 13 |
+
|
| 14 |
+
######## Parse the Given Parameters from the Command ###########
|
| 15 |
+
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
|
| 16 |
+
eval set -- "$options"
|
| 17 |
+
|
| 18 |
+
while true; do
|
| 19 |
+
case $1 in
|
| 20 |
+
# Experimental Configuration File
|
| 21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
| 22 |
+
# Experimental Name
|
| 23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
| 24 |
+
# Running Stage
|
| 25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
| 26 |
+
# Visible GPU machines. The default value is "0".
|
| 27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
| 28 |
+
|
| 29 |
+
# [Only for Training] Resume configuration
|
| 30 |
+
--resume) shift; resume=$1 ; shift ;;
|
| 31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
| 32 |
+
--checkpoint) shift; checkpoint=$1 ; shift ;;
|
| 33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
| 34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
| 35 |
+
|
| 36 |
+
# [Only for Inference] The inference mode
|
| 37 |
+
--infer_mode) shift; infer_mode=$1 ; shift ;;
|
| 38 |
+
# [Only for Inference] The inferenced datasets
|
| 39 |
+
--infer_datasets) shift; infer_datasets=$1 ; shift ;;
|
| 40 |
+
# [Only for Inference] The feature dir for inference
|
| 41 |
+
--infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
|
| 42 |
+
# [Only for Inference] The audio dir for inference
|
| 43 |
+
--infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
|
| 44 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
| 45 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
| 46 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
| 47 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
| 48 |
+
|
| 49 |
+
--) shift ; break ;;
|
| 50 |
+
*) echo "Invalid option: $1" exit 1 ;;
|
| 51 |
+
esac
|
| 52 |
+
done
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
### Value check ###
|
| 56 |
+
if [ -z "$running_stage" ]; then
|
| 57 |
+
echo "[Error] Please specify the running stage"
|
| 58 |
+
exit 1
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
if [ -z "$exp_config" ]; then
|
| 62 |
+
exp_config="${exp_dir}"/exp_config.json
|
| 63 |
+
fi
|
| 64 |
+
echo "Exprimental Configuration File: $exp_config"
|
| 65 |
+
|
| 66 |
+
if [ -z "$gpu" ]; then
|
| 67 |
+
gpu="0"
|
| 68 |
+
fi
|
| 69 |
+
|
| 70 |
+
######## Features Extraction ###########
|
| 71 |
+
if [ $running_stage -eq 1 ]; then
|
| 72 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
|
| 73 |
+
--config $exp_config \
|
| 74 |
+
--num_workers 8
|
| 75 |
+
fi
|
| 76 |
+
|
| 77 |
+
######## Training ###########
|
| 78 |
+
if [ $running_stage -eq 2 ]; then
|
| 79 |
+
if [ -z "$exp_name" ]; then
|
| 80 |
+
echo "[Error] Please specify the experiments name"
|
| 81 |
+
exit 1
|
| 82 |
+
fi
|
| 83 |
+
echo "Exprimental Name: $exp_name"
|
| 84 |
+
|
| 85 |
+
if [ "$resume" = true ]; then
|
| 86 |
+
echo "Automatically resume from the experimental dir..."
|
| 87 |
+
CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 88 |
+
--config "$exp_config" \
|
| 89 |
+
--exp_name "$exp_name" \
|
| 90 |
+
--log_level info \
|
| 91 |
+
--resume
|
| 92 |
+
else
|
| 93 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 94 |
+
--config "$exp_config" \
|
| 95 |
+
--exp_name "$exp_name" \
|
| 96 |
+
--log_level info \
|
| 97 |
+
--checkpoint "$checkpoint" \
|
| 98 |
+
--resume_type "$resume_type"
|
| 99 |
+
fi
|
| 100 |
+
fi
|
| 101 |
+
|
| 102 |
+
######## Inference/Conversion ###########
|
| 103 |
+
if [ $running_stage -eq 3 ]; then
|
| 104 |
+
if [ -z "$infer_expt_dir" ]; then
|
| 105 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
| 106 |
+
exit 1
|
| 107 |
+
fi
|
| 108 |
+
|
| 109 |
+
if [ -z "$infer_output_dir" ]; then
|
| 110 |
+
infer_output_dir="$infer_expt_dir/result"
|
| 111 |
+
fi
|
| 112 |
+
|
| 113 |
+
if [ $infer_mode = "infer_from_dataset" ]; then
|
| 114 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 115 |
+
--config $exp_config \
|
| 116 |
+
--infer_mode $infer_mode \
|
| 117 |
+
--infer_datasets $infer_datasets \
|
| 118 |
+
--vocoder_dir $infer_expt_dir \
|
| 119 |
+
--output_dir $infer_output_dir \
|
| 120 |
+
--log_level debug
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
if [ $infer_mode = "infer_from_feature" ]; then
|
| 124 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 125 |
+
--config $exp_config \
|
| 126 |
+
--infer_mode $infer_mode \
|
| 127 |
+
--feature_folder $infer_feature_dir \
|
| 128 |
+
--vocoder_dir $infer_expt_dir \
|
| 129 |
+
--output_dir $infer_output_dir \
|
| 130 |
+
--log_level debug
|
| 131 |
+
fi
|
| 132 |
+
|
| 133 |
+
if [ $infer_mode = "infer_from_audio" ]; then
|
| 134 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 135 |
+
--config $exp_config \
|
| 136 |
+
--infer_mode $infer_mode \
|
| 137 |
+
--audio_folder $infer_audio_dir \
|
| 138 |
+
--vocoder_dir $infer_expt_dir \
|
| 139 |
+
--output_dir $infer_output_dir \
|
| 140 |
+
--log_level debug
|
| 141 |
+
fi
|
| 142 |
+
|
| 143 |
+
fi
|
egs/vocoder/gan/melgan/exp_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
| 1 |
+
{
|
| 2 |
+
"base_config": "egs/vocoder/gan/exp_config_base.json",
|
| 3 |
+
"preprocess": {
|
| 4 |
+
// acoustic features
|
| 5 |
+
"extract_mel": true,
|
| 6 |
+
"extract_audio": true,
|
| 7 |
+
|
| 8 |
+
// Features used for model training
|
| 9 |
+
"use_mel": true,
|
| 10 |
+
"use_audio": true
|
| 11 |
+
},
|
| 12 |
+
"model": {
|
| 13 |
+
"generator": "melgan",
|
| 14 |
+
"melgan": {
|
| 15 |
+
"ratios": [8, 8, 2, 2],
|
| 16 |
+
"ngf": 32,
|
| 17 |
+
"n_residual_layers": 3,
|
| 18 |
+
"num_D": 3,
|
| 19 |
+
"ndf": 16,
|
| 20 |
+
"n_layers": 4,
|
| 21 |
+
"downsampling_factor": 4
|
| 22 |
+
},
|
| 23 |
+
},
|
| 24 |
+
"train": {
|
| 25 |
+
"criterions": [
|
| 26 |
+
"feature",
|
| 27 |
+
"discriminator",
|
| 28 |
+
"generator",
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
"inference": {
|
| 32 |
+
"batch_size": 1,
|
| 33 |
+
}
|
| 34 |
+
}
|
egs/vocoder/gan/melgan/run.sh
ADDED
|
@@ -0,0 +1,143 @@
|
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
######## Build Experiment Environment ###########
|
| 7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
| 8 |
+
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
|
| 9 |
+
|
| 10 |
+
export WORK_DIR=$work_dir
|
| 11 |
+
export PYTHONPATH=$work_dir
|
| 12 |
+
export PYTHONIOENCODING=UTF-8
|
| 13 |
+
|
| 14 |
+
######## Parse the Given Parameters from the Command ###########
|
| 15 |
+
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
|
| 16 |
+
eval set -- "$options"
|
| 17 |
+
|
| 18 |
+
while true; do
|
| 19 |
+
case $1 in
|
| 20 |
+
# Experimental Configuration File
|
| 21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
| 22 |
+
# Experimental Name
|
| 23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
| 24 |
+
# Running Stage
|
| 25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
| 26 |
+
# Visible GPU machines. The default value is "0".
|
| 27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
| 28 |
+
|
| 29 |
+
# [Only for Training] Resume configuration
|
| 30 |
+
--resume) shift; resume=$1 ; shift ;;
|
| 31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
| 32 |
+
--checkpoint) shift; checkpoint=$1 ; shift ;;
|
| 33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
| 34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
| 35 |
+
|
| 36 |
+
# [Only for Inference] The inference mode
|
| 37 |
+
--infer_mode) shift; infer_mode=$1 ; shift ;;
|
| 38 |
+
# [Only for Inference] The inferenced datasets
|
| 39 |
+
--infer_datasets) shift; infer_datasets=$1 ; shift ;;
|
| 40 |
+
# [Only for Inference] The feature dir for inference
|
| 41 |
+
--infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
|
| 42 |
+
# [Only for Inference] The audio dir for inference
|
| 43 |
+
--infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
|
| 44 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
| 45 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
| 46 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
| 47 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
| 48 |
+
|
| 49 |
+
--) shift ; break ;;
|
| 50 |
+
*) echo "Invalid option: $1" exit 1 ;;
|
| 51 |
+
esac
|
| 52 |
+
done
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
### Value check ###
|
| 56 |
+
if [ -z "$running_stage" ]; then
|
| 57 |
+
echo "[Error] Please specify the running stage"
|
| 58 |
+
exit 1
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
if [ -z "$exp_config" ]; then
|
| 62 |
+
exp_config="${exp_dir}"/exp_config.json
|
| 63 |
+
fi
|
| 64 |
+
echo "Exprimental Configuration File: $exp_config"
|
| 65 |
+
|
| 66 |
+
if [ -z "$gpu" ]; then
|
| 67 |
+
gpu="0"
|
| 68 |
+
fi
|
| 69 |
+
|
| 70 |
+
######## Features Extraction ###########
|
| 71 |
+
if [ $running_stage -eq 1 ]; then
|
| 72 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
|
| 73 |
+
--config $exp_config \
|
| 74 |
+
--num_workers 8
|
| 75 |
+
fi
|
| 76 |
+
|
| 77 |
+
######## Training ###########
|
| 78 |
+
if [ $running_stage -eq 2 ]; then
|
| 79 |
+
if [ -z "$exp_name" ]; then
|
| 80 |
+
echo "[Error] Please specify the experiments name"
|
| 81 |
+
exit 1
|
| 82 |
+
fi
|
| 83 |
+
echo "Exprimental Name: $exp_name"
|
| 84 |
+
|
| 85 |
+
if [ "$resume" = true ]; then
|
| 86 |
+
echo "Automatically resume from the experimental dir..."
|
| 87 |
+
CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 88 |
+
--config "$exp_config" \
|
| 89 |
+
--exp_name "$exp_name" \
|
| 90 |
+
--log_level info \
|
| 91 |
+
--resume
|
| 92 |
+
else
|
| 93 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 94 |
+
--config "$exp_config" \
|
| 95 |
+
--exp_name "$exp_name" \
|
| 96 |
+
--log_level info \
|
| 97 |
+
--checkpoint "$checkpoint" \
|
| 98 |
+
--resume_type "$resume_type"
|
| 99 |
+
fi
|
| 100 |
+
fi
|
| 101 |
+
|
| 102 |
+
######## Inference/Conversion ###########
|
| 103 |
+
if [ $running_stage -eq 3 ]; then
|
| 104 |
+
if [ -z "$infer_expt_dir" ]; then
|
| 105 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
| 106 |
+
exit 1
|
| 107 |
+
fi
|
| 108 |
+
|
| 109 |
+
if [ -z "$infer_output_dir" ]; then
|
| 110 |
+
infer_output_dir="$infer_expt_dir/result"
|
| 111 |
+
fi
|
| 112 |
+
|
| 113 |
+
if [ $infer_mode = "infer_from_dataset" ]; then
|
| 114 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 115 |
+
--config $exp_config \
|
| 116 |
+
--infer_mode $infer_mode \
|
| 117 |
+
--infer_datasets $infer_datasets \
|
| 118 |
+
--vocoder_dir $infer_expt_dir \
|
| 119 |
+
--output_dir $infer_output_dir \
|
| 120 |
+
--log_level debug
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
if [ $infer_mode = "infer_from_feature" ]; then
|
| 124 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 125 |
+
--config $exp_config \
|
| 126 |
+
--infer_mode $infer_mode \
|
| 127 |
+
--feature_folder $infer_feature_dir \
|
| 128 |
+
--vocoder_dir $infer_expt_dir \
|
| 129 |
+
--output_dir $infer_output_dir \
|
| 130 |
+
--log_level debug
|
| 131 |
+
fi
|
| 132 |
+
|
| 133 |
+
if [ $infer_mode = "infer_from_audio" ]; then
|
| 134 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 135 |
+
--config $exp_config \
|
| 136 |
+
--infer_mode $infer_mode \
|
| 137 |
+
--audio_folder $infer_audio_dir \
|
| 138 |
+
--vocoder_dir $infer_expt_dir \
|
| 139 |
+
--output_dir $infer_output_dir \
|
| 140 |
+
--log_level debug
|
| 141 |
+
fi
|
| 142 |
+
|
| 143 |
+
fi
|
egs/vocoder/gan/nsfhifigan/exp_config.json
ADDED
|
@@ -0,0 +1,83 @@
|
| 1 |
+
{
|
| 2 |
+
"base_config": "egs/vocoder/gan/exp_config_base.json",
|
| 3 |
+
"preprocess": {
|
| 4 |
+
// acoustic features
|
| 5 |
+
"extract_mel": true,
|
| 6 |
+
"extract_audio": true,
|
| 7 |
+
"extract_pitch": true,
|
| 8 |
+
|
| 9 |
+
// Features used for model training
|
| 10 |
+
"use_mel": true,
|
| 11 |
+
"use_audio": true,
|
| 12 |
+
"use_frame_pitch": true
|
| 13 |
+
},
|
| 14 |
+
"model": {
|
| 15 |
+
"generator": "nsfhifigan",
|
| 16 |
+
"nsfhifigan": {
|
| 17 |
+
"resblock": "1",
|
| 18 |
+
"harmonic_num": 8,
|
| 19 |
+
"upsample_rates": [
|
| 20 |
+
8,
|
| 21 |
+
4,
|
| 22 |
+
2,
|
| 23 |
+
2,
|
| 24 |
+
2
|
| 25 |
+
],
|
| 26 |
+
"upsample_kernel_sizes": [
|
| 27 |
+
16,
|
| 28 |
+
8,
|
| 29 |
+
4,
|
| 30 |
+
4,
|
| 31 |
+
4
|
| 32 |
+
],
|
| 33 |
+
"upsample_initial_channel": 768,
|
| 34 |
+
"resblock_kernel_sizes": [
|
| 35 |
+
3,
|
| 36 |
+
7,
|
| 37 |
+
11
|
| 38 |
+
],
|
| 39 |
+
"resblock_dilation_sizes": [
|
| 40 |
+
[
|
| 41 |
+
1,
|
| 42 |
+
3,
|
| 43 |
+
5
|
| 44 |
+
],
|
| 45 |
+
[
|
| 46 |
+
1,
|
| 47 |
+
3,
|
| 48 |
+
5
|
| 49 |
+
],
|
| 50 |
+
[
|
| 51 |
+
1,
|
| 52 |
+
3,
|
| 53 |
+
5
|
| 54 |
+
]
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
"mpd": {
|
| 58 |
+
"mpd_reshapes": [
|
| 59 |
+
2,
|
| 60 |
+
3,
|
| 61 |
+
5,
|
| 62 |
+
7,
|
| 63 |
+
11,
|
| 64 |
+
17,
|
| 65 |
+
23,
|
| 66 |
+
37
|
| 67 |
+
],
|
| 68 |
+
"use_spectral_norm": false,
|
| 69 |
+
"discriminator_channel_multi": 1
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
"train": {
|
| 73 |
+
"criterions": [
|
| 74 |
+
"feature",
|
| 75 |
+
"discriminator",
|
| 76 |
+
"generator",
|
| 77 |
+
"mel",
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
"inference": {
|
| 81 |
+
"batch_size": 1,
|
| 82 |
+
}
|
| 83 |
+
}
|
egs/vocoder/gan/nsfhifigan/run.sh
ADDED
|
@@ -0,0 +1,143 @@
|
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
######## Build Experiment Environment ###########
|
| 7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
| 8 |
+
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
|
| 9 |
+
|
| 10 |
+
export WORK_DIR=$work_dir
|
| 11 |
+
export PYTHONPATH=$work_dir
|
| 12 |
+
export PYTHONIOENCODING=UTF-8
|
| 13 |
+
|
| 14 |
+
######## Parse the Given Parameters from the Command ###########
|
| 15 |
+
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
|
| 16 |
+
eval set -- "$options"
|
| 17 |
+
|
| 18 |
+
while true; do
|
| 19 |
+
case $1 in
|
| 20 |
+
# Experimental Configuration File
|
| 21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
| 22 |
+
# Experimental Name
|
| 23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
| 24 |
+
# Running Stage
|
| 25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
| 26 |
+
# Visible GPU machines. The default value is "0".
|
| 27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
| 28 |
+
|
| 29 |
+
# [Only for Training] Resume configuration
|
| 30 |
+
--resume) shift; resume=$1 ; shift ;;
|
| 31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
| 32 |
+
--checkpoint) shift; checkpoint=$1 ; shift ;;
|
| 33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
| 34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
| 35 |
+
|
| 36 |
+
# [Only for Inference] The inference mode
|
| 37 |
+
--infer_mode) shift; infer_mode=$1 ; shift ;;
|
| 38 |
+
# [Only for Inference] The inferenced datasets
|
| 39 |
+
--infer_datasets) shift; infer_datasets=$1 ; shift ;;
|
| 40 |
+
# [Only for Inference] The feature dir for inference
|
| 41 |
+
--infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
|
| 42 |
+
# [Only for Inference] The audio dir for inference
|
| 43 |
+
--infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
|
| 44 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
| 45 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
| 46 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
| 47 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
| 48 |
+
|
| 49 |
+
--) shift ; break ;;
|
| 50 |
+
*) echo "Invalid option: $1" exit 1 ;;
|
| 51 |
+
esac
|
| 52 |
+
done
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
### Value check ###
|
| 56 |
+
if [ -z "$running_stage" ]; then
|
| 57 |
+
echo "[Error] Please specify the running stage"
|
| 58 |
+
exit 1
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
if [ -z "$exp_config" ]; then
|
| 62 |
+
exp_config="${exp_dir}"/exp_config.json
|
| 63 |
+
fi
|
| 64 |
+
echo "Exprimental Configuration File: $exp_config"
|
| 65 |
+
|
| 66 |
+
if [ -z "$gpu" ]; then
|
| 67 |
+
gpu="0"
|
| 68 |
+
fi
|
| 69 |
+
|
| 70 |
+
######## Features Extraction ###########
|
| 71 |
+
if [ $running_stage -eq 1 ]; then
|
| 72 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
|
| 73 |
+
--config $exp_config \
|
| 74 |
+
--num_workers 8
|
| 75 |
+
fi
|
| 76 |
+
|
| 77 |
+
######## Training ###########
|
| 78 |
+
if [ $running_stage -eq 2 ]; then
|
| 79 |
+
if [ -z "$exp_name" ]; then
|
| 80 |
+
echo "[Error] Please specify the experiments name"
|
| 81 |
+
exit 1
|
| 82 |
+
fi
|
| 83 |
+
echo "Exprimental Name: $exp_name"
|
| 84 |
+
|
| 85 |
+
if [ "$resume" = true ]; then
|
| 86 |
+
echo "Automatically resume from the experimental dir..."
|
| 87 |
+
CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 88 |
+
--config "$exp_config" \
|
| 89 |
+
--exp_name "$exp_name" \
|
| 90 |
+
--log_level info \
|
| 91 |
+
--resume
|
| 92 |
+
else
|
| 93 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
|
| 94 |
+
--config "$exp_config" \
|
| 95 |
+
--exp_name "$exp_name" \
|
| 96 |
+
--log_level info \
|
| 97 |
+
--checkpoint "$checkpoint" \
|
| 98 |
+
--resume_type "$resume_type"
|
| 99 |
+
fi
|
| 100 |
+
fi
|
| 101 |
+
|
| 102 |
+
######## Inference/Conversion ###########
|
| 103 |
+
if [ $running_stage -eq 3 ]; then
|
| 104 |
+
if [ -z "$infer_expt_dir" ]; then
|
| 105 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
| 106 |
+
exit 1
|
| 107 |
+
fi
|
| 108 |
+
|
| 109 |
+
if [ -z "$infer_output_dir" ]; then
|
| 110 |
+
infer_output_dir="$infer_expt_dir/result"
|
| 111 |
+
fi
|
| 112 |
+
|
| 113 |
+
if [ $infer_mode = "infer_from_dataset" ]; then
|
| 114 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 115 |
+
--config $exp_config \
|
| 116 |
+
--infer_mode $infer_mode \
|
| 117 |
+
--infer_datasets $infer_datasets \
|
| 118 |
+
--vocoder_dir $infer_expt_dir \
|
| 119 |
+
--output_dir $infer_output_dir \
|
| 120 |
+
--log_level debug
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
if [ $infer_mode = "infer_from_feature" ]; then
|
| 124 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 125 |
+
--config $exp_config \
|
| 126 |
+
--infer_mode $infer_mode \
|
| 127 |
+
--feature_folder $infer_feature_dir \
|
| 128 |
+
--vocoder_dir $infer_expt_dir \
|
| 129 |
+
--output_dir $infer_output_dir \
|
| 130 |
+
--log_level debug
|
| 131 |
+
fi
|
| 132 |
+
|
| 133 |
+
if [ $infer_mode = "infer_from_audio" ]; then
|
| 134 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
|
| 135 |
+
--config $exp_config \
|
| 136 |
+
--infer_mode $infer_mode \
|
| 137 |
+
--audio_folder $infer_audio_dir \
|
| 138 |
+
--vocoder_dir $infer_expt_dir \
|
| 139 |
+
--output_dir $infer_output_dir \
|
| 140 |
+
--log_level debug
|
| 141 |
+
fi
|
| 142 |
+
|
| 143 |
+
fi
|
egs/vocoder/gan/tfr_enhanced_hifigan/README.md
ADDED
|
@@ -0,0 +1,185 @@
|
| 1 |
+
# Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder
|
| 2 |
+
|
| 3 |
+
[](https://arxiv.org/abs/2311.14957)
|
| 4 |
+
[](https://vocodexelysium.github.io/MS-SB-CQTD/)
|
| 5 |
+
|
| 6 |
+
<br>
|
| 7 |
+
<div align="center">
|
| 8 |
+
<img src="../../../../imgs/vocoder/gan/MSSBCQTD.png" width="80%">
|
| 9 |
+
</div>
|
| 10 |
+
<br>
|
| 11 |
+
|
| 12 |
+
This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple time-frequency-representation-based discriminators.
|
| 13 |
+
|
| 14 |
+
There are four stages in total:
|
| 15 |
+
|
| 16 |
+
1. Data preparation
|
| 17 |
+
2. Feature extraction
|
| 18 |
+
3. Training
|
| 19 |
+
4. Inference
|
| 20 |
+
|
| 21 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
| 22 |
+
> ```bash
|
| 23 |
+
> cd Amphion
|
| 24 |
+
> ```
|
| 25 |
+
|
| 26 |
+
## 1. Data Preparation
|
| 27 |
+
|
| 28 |
+
### Dataset Download
|
| 29 |
+
|
| 30 |
+
By default, we use three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).
|
| 31 |
+
|
| 32 |
+
### Configuration
|
| 33 |
+
|
| 34 |
+
Specify the dataset path in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
| 35 |
+
|
| 36 |
+
```json
|
| 37 |
+
"dataset": [
|
| 38 |
+
"ljspeech",
|
| 39 |
+
"vctk",
|
| 40 |
+
"libritts",
|
| 41 |
+
],
|
| 42 |
+
"dataset_path": {
|
| 43 |
+
// TODO: Fill in your dataset path
|
| 44 |
+
"ljspeech": "[LJSpeech dataset path]",
|
| 45 |
+
"vctk": "[VCTK dataset path]",
|
| 46 |
+
"libritts": "[LibriTTS dataset path]",
|
| 47 |
+
},
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## 2. Features Extraction
|
| 51 |
+
|
| 52 |
+
For HiFiGAN, only the Mel-Spectrogram and the Output Audio are needed for training.
|
| 53 |
+
|
| 54 |
+
### Configuration
|
| 55 |
+
|
| 56 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
| 57 |
+
|
| 58 |
+
```json
|
| 59 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
|
| 60 |
+
"log_dir": "ckpts/vocoder",
|
| 61 |
+
"preprocess": {
|
| 62 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
| 63 |
+
"processed_dir": "data",
|
| 64 |
+
...
|
| 65 |
+
},
|
| 66 |
+
```
|
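For orientation, here is a minimal sketch of the kind of feature this configuration implies: a 100-bin mel spectrogram extracted at 24 kHz. This is not the recipe's own preprocessing code (that is handled by `bins/vocoder/preprocess.py` in stage 1); the use of `librosa` and the `n_fft`/`hop_length` values are illustrative assumptions.

```python
# Illustrative sketch only -- the real preprocessing is done by
# bins/vocoder/preprocess.py. n_fft and hop_length are assumed values.
import librosa
import numpy as np


def extract_log_mel(wav_path, sr=24000, n_fft=1024, hop_length=256, n_mels=100):
    # Load and resample the audio to the configured sample rate
    y, _ = librosa.load(wav_path, sr=sr)
    # Compute a mel spectrogram with the configured number of mel bins
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    # Log-compress; the result has shape [n_mels, n_frames]
    return np.log(np.clip(mel, a_min=1e-5, a_max=None))
```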
| 67 |
+
|
| 68 |
+
### Run
|
| 69 |
+
|
| 70 |
+
Run the `run.sh` as the preprocessing stage (set `--stage 1`).
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
| 77 |
+
|
| 78 |
+
## 3. Training
|
| 79 |
+
|
| 80 |
+
### Configuration
|
| 81 |
+
|
| 82 |
+
We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.
|
| 83 |
+
|
| 84 |
+
```json
|
| 85 |
+
"train": {
|
| 86 |
+
"batch_size": 32,
|
| 87 |
+
...
|
| 88 |
+
}
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### Run
|
| 92 |
+
|
| 93 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
| 100 |
+
|
| 101 |
+
## 4. Inference
|
| 102 |
+
|
| 103 |
+
### Pretrained Vocoder Download
|
| 104 |
+
|
| 105 |
+
We trained a HiFiGAN checkpoint with around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).
|
| 106 |
+
|
| 107 |
+
### Run
|
| 108 |
+
|
| 109 |
+
Run the `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes, including `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
|
| 113 |
+
--infer_mode [Your chosen inference mode] \
|
| 114 |
+
--infer_datasets [Datasets you want to run inference on, needed when infer_from_dataset] \
|
| 115 |
+
--infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
|
| 116 |
+
--infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
|
| 117 |
+
--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
|
| 118 |
+
--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
#### a. Inference from Dataset
|
| 122 |
+
|
| 123 |
+
Run the `run.sh` with the specified datasets; here is an example.
|
| 124 |
+
|
| 125 |
+
```bash
|
| 126 |
+
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
|
| 127 |
+
--infer_mode infer_from_dataset \
|
| 128 |
+
--infer_datasets "libritts vctk ljspeech" \
|
| 129 |
+
--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
|
| 130 |
+
--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
#### b. Inference from Features
|
| 134 |
+
|
| 135 |
+
If you want to run inference from your generated acoustic features, you should first organize them into the following structure:
|
| 136 |
+
|
| 137 |
+
```plaintext
|
| 138 |
+
┣ {infer_feature_dir}
|
| 139 |
+
┃ ┣ mels
|
| 140 |
+
┃ ┃ ┣ sample1.npy
|
| 141 |
+
┃ ┃ ┣ sample2.npy
|
| 142 |
+
```
|
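As a minimal sketch (assuming your acoustic model yields mel spectrograms as NumPy arrays; the helper name and file naming below are only for illustration), each predicted feature can be written into this layout like so:

```python
# Illustrative sketch only: write each predicted mel spectrogram into
# {infer_feature_dir}/mels/ as a .npy file, matching the layout above.
import os
import numpy as np


def save_mel(mel: np.ndarray, infer_feature_dir: str, utt_id: str) -> None:
    mel_dir = os.path.join(infer_feature_dir, "mels")
    os.makedirs(mel_dir, exist_ok=True)
    np.save(os.path.join(mel_dir, f"{utt_id}.npy"), mel)
```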
| 143 |
+
|
| 144 |
+
Then run the `run.sh` with the specified folder path; here is an example.
|
| 145 |
+
|
| 146 |
+
```bash
|
| 147 |
+
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
|
| 148 |
+
--infer_mode infer_from_feature \
|
| 149 |
+
--infer_feature_dir [Your path to your predicted acoustic features] \
|
| 150 |
+
--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
|
| 151 |
+
--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
#### c. Inference from Audios
|
| 155 |
+
|
| 156 |
+
If you want to run inference from audio files for quick analysis-synthesis, you should first organize them into the following structure:
|
| 157 |
+
|
| 158 |
+
```plaintext
|
| 159 |
+
┣ audios
|
| 160 |
+
┃ ┣ sample1.wav
|
| 161 |
+
┃ ┣ sample2.wav
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
Then run the `run.sh` with the specified folder path; here is an example.
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
|
| 168 |
+
--infer_mode infer_from_audio \
|
| 169 |
+
--infer_audio_dir [Your path to your audio files] \
|
| 170 |
+
--infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
|
| 171 |
+
--infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
## Citations
|
| 175 |
+
|
| 176 |
+
```bibtex
|
| 177 |
+
@misc{gu2023cqt,
|
| 178 |
+
title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
|
| 179 |
+
author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
|
| 180 |
+
year={2023},
|
| 181 |
+
eprint={2311.14957},
|
| 182 |
+
archivePrefix={arXiv},
|
| 183 |
+
primaryClass={cs.SD}
|
| 184 |
+
}
|
| 185 |
+
```
|
egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json
ADDED
@@ -0,0 +1,118 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "model_type": "GANVocoder",
    "dataset": [
        "ljspeech",
        "vctk",
        "libritts",
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "ljspeech": "[dataset path]",
        "vctk": "[dataset path]",
        "libritts": "[dataset path]",
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
    "log_dir": "ckpts/vocoder",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // acoustic features
        "extract_mel": true,
        "extract_audio": true,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_amplitude_phase": false,
        "pitch_extractor": "parselmouth",
        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "generator": "hifigan",
        "discriminators": [
            "msd",
            "mpd",
            "mssbcqtd",
            "msstftd",
        ],
        "hifigan": {
            "resblock": "1",
            "upsample_rates": [
                8,
                4,
                2,
                2,
                2
            ],
            "upsample_kernel_sizes": [
                16,
                8,
                4,
                4,
                4
            ],
            "upsample_initial_channel": 768,
            "resblock_kernel_sizes": [
                3,
                5,
                7
            ],
            "resblock_dilation_sizes": [
                [
                    1,
                    3,
                    5
                ],
                [
                    1,
                    3,
                    5
                ],
                [
                    1,
                    3,
                    5
                ]
            ]
        },
        "mpd": {
            "mpd_reshapes": [
                2,
                3,
                5,
                7,
                11,
                17,
                23,
                37
            ],
            "use_spectral_norm": false,
            "discriminator_channel_multi": 1
        }
    },
    "train": {
        "batch_size": 16,
        "adamw": {
            "lr": 2.0e-4,
            "adam_b1": 0.8,
            "adam_b2": 0.99
        },
        "exponential_lr": {
            "lr_decay": 0.999
        },
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel",
        ]
    },
    "inference": {
        "batch_size": 1,
    }
}
egs/vocoder/gan/tfr_enhanced_hifigan/run.sh
ADDED
@@ -0,0 +1,145 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --checkpoint) shift; checkpoint=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The inference mode
    --infer_mode) shift; infer_mode=$1 ; shift ;;
    # [Only for Inference] The datasets used for inference
    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
    # [Only for Inference] The feature dir for inference
    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
    # [Only for Inference] The audio dir for inference
    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1" ; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
        --config $exp_config \
        --num_workers 8
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --checkpoint "$checkpoint" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    echo $infer_datasets

    if [ $infer_mode = "infer_from_dataset" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --infer_datasets $infer_datasets \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_feature" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --feature_folder $infer_feature_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_audio" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --audio_folder $infer_audio_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

fi
examples/chinese_female_recordings.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f710270fe3857211c55aaa1f813e310e68855ff9eabaf5b249537a2d4277cc30
size 448928
examples/chinese_male_seperated.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:009077a677b23bff3154078930e6c624d218eb0acbe78990bec88f6bf5a6e5de
size 480044
examples/english_female_seperated.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87e75863ffb4e597467a825d019217e73d64dce1e9635de60a32559ffcb97cf4
size 1509584
examples/english_male_recordings.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e14ebf1c554ebb25e5169b4bcda36a685538e94c531f303339bad91ff93a2288
size 251948
examples/output/.DS_Store
ADDED
Binary file (6.15 kB)
examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf6d6ef89ba2234fbc64c0ee48f81528cf49717a23a919aa8d0767ada2437113
size 244268
examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e682abb072246f412133bfa313c6edf863f1d6a6db63022749f74c2c7ef01c7
size 479788
examples/output/english_female_seperated_vocalist_l1_汪峰.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a03755cfc9aef4d26bda6370d9335625482f22f2c1f3c918dbbec3246213cee2
size 410668
examples/output/english_male_recordings_vocalist_l1_石倚洁.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e850a0e02f2741185c3d3b642a9c292a3a297cdf262e92333b63adf98af7d450
size 251948
models/__init__.py
ADDED
File without changes
models/base/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .new_trainer import BaseTrainer
from .new_inference import BaseInference
models/base/base_dataset.py
ADDED
|
@@ -0,0 +1,350 @@
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch.utils.data
|
| 9 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 10 |
+
from utils.data_utils import *
|
| 11 |
+
from processors.acoustic_extractor import cal_normalized_mel
|
| 12 |
+
from text import text_to_sequence
|
| 13 |
+
from text.text_token_collation import phoneIDCollation
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class BaseDataset(torch.utils.data.Dataset):
|
| 17 |
+
def __init__(self, cfg, dataset, is_valid=False):
|
| 18 |
+
"""
|
| 19 |
+
Args:
|
| 20 |
+
cfg: config
|
| 21 |
+
dataset: dataset name
|
| 22 |
+
is_valid: whether to use train or valid dataset
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
assert isinstance(dataset, str)
|
| 26 |
+
|
| 27 |
+
# self.data_root = processed_data_dir
|
| 28 |
+
self.cfg = cfg
|
| 29 |
+
|
| 30 |
+
processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
|
| 31 |
+
meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
|
| 32 |
+
self.metafile_path = os.path.join(processed_data_dir, meta_file)
|
| 33 |
+
self.metadata = self.get_metadata()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
'''
|
| 38 |
+
load spk2id and utt2spk from json file
|
| 39 |
+
spk2id: {spk1: 0, spk2: 1, ...}
|
| 40 |
+
utt2spk: {dataset_uid: spk1, ...}
|
| 41 |
+
'''
|
| 42 |
+
if cfg.preprocess.use_spkid:
|
| 43 |
+
spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id)
|
| 44 |
+
with open(spk2id_path, "r") as f:
|
| 45 |
+
self.spk2id = json.load(f)
|
| 46 |
+
|
| 47 |
+
utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk)
|
| 48 |
+
self.utt2spk = dict()
|
| 49 |
+
with open(utt2spk_path, "r") as f:
|
| 50 |
+
for line in f.readlines():
|
| 51 |
+
utt, spk = line.strip().split('\t')
|
| 52 |
+
self.utt2spk[utt] = spk
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
if cfg.preprocess.use_uv:
|
| 56 |
+
self.utt2uv_path = {}
|
| 57 |
+
for utt_info in self.metadata:
|
| 58 |
+
dataset = utt_info["Dataset"]
|
| 59 |
+
uid = utt_info["Uid"]
|
| 60 |
+
utt = "{}_{}".format(dataset, uid)
|
| 61 |
+
self.utt2uv_path[utt] = os.path.join(
|
| 62 |
+
cfg.preprocess.processed_dir,
|
| 63 |
+
dataset,
|
| 64 |
+
cfg.preprocess.uv_dir,
|
| 65 |
+
uid + ".npy",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
if cfg.preprocess.use_frame_pitch:
|
| 69 |
+
self.utt2frame_pitch_path = {}
|
| 70 |
+
for utt_info in self.metadata:
|
| 71 |
+
dataset = utt_info["Dataset"]
|
| 72 |
+
uid = utt_info["Uid"]
|
| 73 |
+
utt = "{}_{}".format(dataset, uid)
|
| 74 |
+
|
| 75 |
+
self.utt2frame_pitch_path[utt] = os.path.join(
|
| 76 |
+
cfg.preprocess.processed_dir,
|
| 77 |
+
dataset,
|
| 78 |
+
cfg.preprocess.pitch_dir,
|
| 79 |
+
uid + ".npy",
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
if cfg.preprocess.use_frame_energy:
|
| 83 |
+
self.utt2frame_energy_path = {}
|
| 84 |
+
for utt_info in self.metadata:
|
| 85 |
+
dataset = utt_info["Dataset"]
|
| 86 |
+
uid = utt_info["Uid"]
|
| 87 |
+
utt = "{}_{}".format(dataset, uid)
|
| 88 |
+
|
| 89 |
+
self.utt2frame_energy_path[utt] = os.path.join(
|
| 90 |
+
cfg.preprocess.processed_dir,
|
| 91 |
+
dataset,
|
| 92 |
+
cfg.preprocess.energy_dir,
|
| 93 |
+
uid + ".npy",
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
if cfg.preprocess.use_mel:
|
| 97 |
+
self.utt2mel_path = {}
|
| 98 |
+
for utt_info in self.metadata:
|
| 99 |
+
dataset = utt_info["Dataset"]
|
| 100 |
+
uid = utt_info["Uid"]
|
| 101 |
+
utt = "{}_{}".format(dataset, uid)
|
| 102 |
+
|
| 103 |
+
self.utt2mel_path[utt] = os.path.join(
|
| 104 |
+
cfg.preprocess.processed_dir,
|
| 105 |
+
dataset,
|
| 106 |
+
cfg.preprocess.mel_dir,
|
| 107 |
+
uid + ".npy",
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
if cfg.preprocess.use_linear:
|
| 111 |
+
self.utt2linear_path = {}
|
| 112 |
+
for utt_info in self.metadata:
|
| 113 |
+
dataset = utt_info["Dataset"]
|
| 114 |
+
uid = utt_info["Uid"]
|
| 115 |
+
utt = "{}_{}".format(dataset, uid)
|
| 116 |
+
|
| 117 |
+
self.utt2linear_path[utt] = os.path.join(
|
| 118 |
+
cfg.preprocess.processed_dir,
|
| 119 |
+
dataset,
|
| 120 |
+
cfg.preprocess.linear_dir,
|
| 121 |
+
uid + ".npy",
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
if cfg.preprocess.use_audio:
|
| 125 |
+
self.utt2audio_path = {}
|
| 126 |
+
for utt_info in self.metadata:
|
| 127 |
+
dataset = utt_info["Dataset"]
|
| 128 |
+
uid = utt_info["Uid"]
|
| 129 |
+
utt = "{}_{}".format(dataset, uid)
|
| 130 |
+
|
| 131 |
+
self.utt2audio_path[utt] = os.path.join(
|
| 132 |
+
cfg.preprocess.processed_dir,
|
| 133 |
+
dataset,
|
| 134 |
+
cfg.preprocess.audio_dir,
|
| 135 |
+
uid + ".npy",
|
| 136 |
+
)
|
| 137 |
+
elif cfg.preprocess.use_label:
|
| 138 |
+
self.utt2label_path = {}
|
| 139 |
+
for utt_info in self.metadata:
|
| 140 |
+
dataset = utt_info["Dataset"]
|
| 141 |
+
uid = utt_info["Uid"]
|
| 142 |
+
utt = "{}_{}".format(dataset, uid)
|
| 143 |
+
|
| 144 |
+
self.utt2label_path[utt] = os.path.join(
|
| 145 |
+
cfg.preprocess.processed_dir,
|
| 146 |
+
dataset,
|
| 147 |
+
cfg.preprocess.label_dir,
|
| 148 |
+
uid + ".npy",
|
| 149 |
+
)
|
| 150 |
+
elif cfg.preprocess.use_one_hot:
|
| 151 |
+
self.utt2one_hot_path = {}
|
| 152 |
+
for utt_info in self.metadata:
|
| 153 |
+
dataset = utt_info["Dataset"]
|
| 154 |
+
uid = utt_info["Uid"]
|
| 155 |
+
utt = "{}_{}".format(dataset, uid)
|
| 156 |
+
|
| 157 |
+
self.utt2one_hot_path[utt] = os.path.join(
|
| 158 |
+
cfg.preprocess.processed_dir,
|
| 159 |
+
dataset,
|
| 160 |
+
cfg.preprocess.one_hot_dir,
|
| 161 |
+
uid + ".npy",
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
if cfg.preprocess.use_text or cfg.preprocess.use_phone:
|
| 165 |
+
self.utt2seq = {}
|
| 166 |
+
for utt_info in self.metadata:
|
| 167 |
+
dataset = utt_info["Dataset"]
|
| 168 |
+
uid = utt_info["Uid"]
|
| 169 |
+
utt = "{}_{}".format(dataset, uid)
|
| 170 |
+
|
| 171 |
+
if cfg.preprocess.use_text:
|
| 172 |
+
text = utt_info["Text"]
|
| 173 |
+
sequence = text_to_sequence(text, cfg.preprocess.text_cleaners)
|
| 174 |
+
elif cfg.preprocess.use_phone:
|
| 175 |
+
# load the phoneme sequence from the phone file
|
| 176 |
+
phone_path = os.path.join(processed_data_dir,
|
| 177 |
+
cfg.preprocess.phone_dir,
|
| 178 |
+
uid+'.phone'
|
| 179 |
+
)
|
| 180 |
+
with open(phone_path, 'r') as fin:
|
| 181 |
+
phones = fin.readlines()
|
| 182 |
+
assert len(phones) == 1
|
| 183 |
+
phones = phones[0].strip()
|
| 184 |
+
phones_seq = phones.split(' ')
|
| 185 |
+
|
| 186 |
+
phon_id_collator = phoneIDCollation(cfg, dataset=dataset)
|
| 187 |
+
sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq)
|
| 188 |
+
|
| 189 |
+
self.utt2seq[utt] = sequence
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def get_metadata(self):
|
| 193 |
+
with open(self.metafile_path, "r", encoding="utf-8") as f:
|
| 194 |
+
metadata = json.load(f)
|
| 195 |
+
|
| 196 |
+
return metadata
|
| 197 |
+
|
| 198 |
+
def get_dataset_name(self):
|
| 199 |
+
return self.metadata[0]["Dataset"]
|
| 200 |
+
|
| 201 |
+
def __getitem__(self, index):
|
| 202 |
+
utt_info = self.metadata[index]
|
| 203 |
+
|
| 204 |
+
dataset = utt_info["Dataset"]
|
| 205 |
+
uid = utt_info["Uid"]
|
| 206 |
+
utt = "{}_{}".format(dataset, uid)
|
| 207 |
+
|
| 208 |
+
single_feature = dict()
|
| 209 |
+
|
| 210 |
+
if self.cfg.preprocess.use_spkid:
|
| 211 |
+
single_feature["spk_id"] = np.array(
|
| 212 |
+
[self.spk2id[self.utt2spk[utt]]], dtype=np.int32
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
if self.cfg.preprocess.use_mel:
|
| 216 |
+
mel = np.load(self.utt2mel_path[utt])
|
| 217 |
+
assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
|
| 218 |
+
if self.cfg.preprocess.use_min_max_norm_mel:
|
| 219 |
+
# do mel norm
|
| 220 |
+
mel = cal_normalized_mel(mel, utt_info["Dataset"], self.cfg.preprocess)
|
| 221 |
+
|
| 222 |
+
if "target_len" not in single_feature.keys():
|
| 223 |
+
single_feature["target_len"] = mel.shape[1]
|
| 224 |
+
single_feature["mel"] = mel.T # [T, n_mels]
|
| 225 |
+
|
| 226 |
+
if self.cfg.preprocess.use_linear:
|
| 227 |
+
linear = np.load(self.utt2linear_path[utt])
|
| 228 |
+
if "target_len" not in single_feature.keys():
|
| 229 |
+
single_feature["target_len"] = linear.shape[1]
|
| 230 |
+
single_feature["linear"] = linear.T # [T, n_linear]
|
| 231 |
+
|
| 232 |
+
if self.cfg.preprocess.use_frame_pitch:
|
| 233 |
+
frame_pitch_path = self.utt2frame_pitch_path[utt]
|
| 234 |
+
frame_pitch = np.load(frame_pitch_path)
|
| 235 |
+
if "target_len" not in single_feature.keys():
|
| 236 |
+
single_feature["target_len"] = len(frame_pitch)
|
| 237 |
+
aligned_frame_pitch = align_length(
|
| 238 |
+
frame_pitch, single_feature["target_len"]
|
| 239 |
+
)
|
| 240 |
+
single_feature["frame_pitch"] = aligned_frame_pitch
|
| 241 |
+
|
| 242 |
+
if self.cfg.preprocess.use_uv:
|
| 243 |
+
frame_uv_path = self.utt2uv_path[utt]
|
| 244 |
+
frame_uv = np.load(frame_uv_path)
|
| 245 |
+
aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
|
| 246 |
+
aligned_frame_uv = [
|
| 247 |
+
0 if frame_uv else 1 for frame_uv in aligned_frame_uv
|
| 248 |
+
]
|
| 249 |
+
aligned_frame_uv = np.array(aligned_frame_uv)
|
| 250 |
+
single_feature["frame_uv"] = aligned_frame_uv
|
| 251 |
+
|
| 252 |
+
if self.cfg.preprocess.use_frame_energy:
|
| 253 |
+
frame_energy_path = self.utt2frame_energy_path[utt]
|
| 254 |
+
frame_energy = np.load(frame_energy_path)
|
| 255 |
+
if "target_len" not in single_feature.keys():
|
| 256 |
+
single_feature["target_len"] = len(frame_energy)
|
| 257 |
+
aligned_frame_energy = align_length(
|
| 258 |
+
frame_energy, single_feature["target_len"]
|
| 259 |
+
)
|
| 260 |
+
single_feature["frame_energy"] = aligned_frame_energy
|
| 261 |
+
|
| 262 |
+
if self.cfg.preprocess.use_audio:
|
| 263 |
+
audio = np.load(self.utt2audio_path[utt])
|
| 264 |
+
single_feature["audio"] = audio
|
| 265 |
+
single_feature["audio_len"] = audio.shape[0]
|
| 266 |
+
|
| 267 |
+
if self.cfg.preprocess.use_phone or self.cfg.preprocess.use_text:
|
| 268 |
+
single_feature["phone_seq"] = np.array(self.utt2seq[utt])
|
| 269 |
+
single_feature["phone_len"] = len(self.utt2seq[utt])
|
| 270 |
+
|
| 271 |
+
return single_feature
|
| 272 |
+
|
| 273 |
+
def __len__(self):
|
| 274 |
+
return len(self.metadata)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
class BaseCollator(object):
|
| 278 |
+
"""Zero-pads model inputs and targets based on number of frames per step"""
|
| 279 |
+
|
| 280 |
+
def __init__(self, cfg):
|
| 281 |
+
self.cfg = cfg
|
| 282 |
+
|
| 283 |
+
def __call__(self, batch):
|
| 284 |
+
packed_batch_features = dict()
|
| 285 |
+
|
| 286 |
+
# mel: [b, T, n_mels]
|
| 287 |
+
# frame_pitch, frame_energy: [1, T]
|
| 288 |
+
# target_len: [1]
|
| 289 |
+
# spk_id: [b, 1]
|
| 290 |
+
# mask: [b, T, 1]
|
| 291 |
+
|
| 292 |
+
for key in batch[0].keys():
|
| 293 |
+
if key == "target_len":
|
| 294 |
+
packed_batch_features["target_len"] = torch.LongTensor(
|
| 295 |
+
[b["target_len"] for b in batch]
|
| 296 |
+
)
|
| 297 |
+
masks = [
|
| 298 |
+
torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
|
| 299 |
+
]
|
| 300 |
+
packed_batch_features["mask"] = pad_sequence(
|
| 301 |
+
masks, batch_first=True, padding_value=0
|
| 302 |
+
)
|
| 303 |
+
elif key == "phone_len":
|
| 304 |
+
packed_batch_features["phone_len"] = torch.LongTensor(
|
| 305 |
+
[b["phone_len"] for b in batch]
|
| 306 |
+
)
|
| 307 |
+
masks = [
|
| 308 |
+
torch.ones((b["phone_len"], 1), dtype=torch.long) for b in batch
|
| 309 |
+
]
|
| 310 |
+
packed_batch_features["phn_mask"] = pad_sequence(
|
| 311 |
+
masks, batch_first=True, padding_value=0
|
| 312 |
+
)
|
| 313 |
+
elif key == "audio_len":
|
| 314 |
+
packed_batch_features["audio_len"] = torch.LongTensor(
|
| 315 |
+
[b["audio_len"] for b in batch]
|
| 316 |
+
)
|
| 317 |
+
masks = [
|
| 318 |
+
torch.ones((b["audio_len"], 1), dtype=torch.long) for b in batch
|
| 319 |
+
]
|
| 320 |
+
else:
|
| 321 |
+
values = [torch.from_numpy(b[key]) for b in batch]
|
| 322 |
+
packed_batch_features[key] = pad_sequence(
|
| 323 |
+
values, batch_first=True, padding_value=0
|
| 324 |
+
)
|
| 325 |
+
return packed_batch_features
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
class BaseTestDataset(torch.utils.data.Dataset):
|
| 329 |
+
def __init__(self, cfg, args):
|
| 330 |
+
raise NotImplementedError
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def get_metadata(self):
|
| 334 |
+
raise NotImplementedError
|
| 335 |
+
|
| 336 |
+
def __getitem__(self, index):
|
| 337 |
+
raise NotImplementedError
|
| 338 |
+
|
| 339 |
+
def __len__(self):
|
| 340 |
+
return len(self.metadata)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
class BaseTestCollator(object):
|
| 344 |
+
"""Zero-pads model inputs and targets based on number of frames per step"""
|
| 345 |
+
|
| 346 |
+
def __init__(self, cfg):
|
| 347 |
+
raise NotImplementedError
|
| 348 |
+
|
| 349 |
+
def __call__(self, batch):
|
| 350 |
+
raise NotImplementedError
|
models/base/base_inference.py
ADDED
|
@@ -0,0 +1,220 @@
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import time
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
from torch.utils.data import DataLoader
|
| 14 |
+
from tqdm import tqdm
|
| 15 |
+
|
| 16 |
+
from models.vocoders.vocoder_inference import synthesis
|
| 17 |
+
from torch.utils.data import DataLoader
|
| 18 |
+
from utils.util import set_all_random_seed
|
| 19 |
+
from utils.util import load_config
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse_vocoder(vocoder_dir):
|
| 23 |
+
r"""Parse vocoder config"""
|
| 24 |
+
vocoder_dir = os.path.abspath(vocoder_dir)
|
| 25 |
+
ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
|
| 26 |
+
ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
|
| 27 |
+
ckpt_path = str(ckpt_list[0])
|
| 28 |
+
vocoder_cfg = load_config(os.path.join(vocoder_dir, "args.json"), lowercase=True)
|
| 29 |
+
vocoder_cfg.model.bigvgan = vocoder_cfg.vocoder
|
| 30 |
+
return vocoder_cfg, ckpt_path
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class BaseInference(object):
|
| 34 |
+
def __init__(self, cfg, args):
|
| 35 |
+
self.cfg = cfg
|
| 36 |
+
self.args = args
|
| 37 |
+
self.model_type = cfg.model_type
|
| 38 |
+
self.avg_rtf = list()
|
| 39 |
+
set_all_random_seed(10086)
|
| 40 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
if torch.cuda.is_available():
|
| 43 |
+
self.device = torch.device("cuda")
|
| 44 |
+
else:
|
| 45 |
+
self.device = torch.device("cpu")
|
| 46 |
+
torch.set_num_threads(10) # inference on 1 core cpu.
|
| 47 |
+
|
| 48 |
+
# Load acoustic model
|
| 49 |
+
self.model = self.create_model().to(self.device)
|
| 50 |
+
state_dict = self.load_state_dict()
|
| 51 |
+
self.load_model(state_dict)
|
| 52 |
+
self.model.eval()
|
| 53 |
+
|
| 54 |
+
# Load vocoder model if necessary
|
| 55 |
+
if self.args.checkpoint_dir_vocoder is not None:
|
| 56 |
+
self.get_vocoder_info()
|
| 57 |
+
|
| 58 |
+
def create_model(self):
|
| 59 |
+
raise NotImplementedError
|
| 60 |
+
|
| 61 |
+
def load_state_dict(self):
|
| 62 |
+
self.checkpoint_file = self.args.checkpoint_file
|
| 63 |
+
if self.checkpoint_file is None:
|
| 64 |
+
assert self.args.checkpoint_dir is not None
|
| 65 |
+
checkpoint_path = os.path.join(self.args.checkpoint_dir, "checkpoint")
|
| 66 |
+
checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
|
| 67 |
+
self.checkpoint_file = os.path.join(
|
| 68 |
+
self.args.checkpoint_dir, checkpoint_filename
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
self.checkpoint_dir = os.path.split(self.checkpoint_file)[0]
|
| 72 |
+
|
| 73 |
+
print("Restore acoustic model from {}".format(self.checkpoint_file))
|
| 74 |
+
raw_state_dict = torch.load(self.checkpoint_file, map_location=self.device)
|
| 75 |
+
self.am_restore_step = re.findall(r"step-(.+?)_loss", self.checkpoint_file)[0]
|
| 76 |
+
|
| 77 |
+
return raw_state_dict
|
| 78 |
+
|
| 79 |
+
def load_model(self, model):
|
| 80 |
+
raise NotImplementedError
|
| 81 |
+
|
| 82 |
+
def get_vocoder_info(self):
|
| 83 |
+
self.checkpoint_dir_vocoder = self.args.checkpoint_dir_vocoder
|
| 84 |
+
self.vocoder_cfg = os.path.join(
|
| 85 |
+
os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
|
| 86 |
+
)
|
| 87 |
+
self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
|
| 88 |
+
self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
|
| 89 |
+
self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]
|
| 90 |
+
|
| 91 |
+
def build_test_utt_data(self):
|
| 92 |
+
raise NotImplementedError
|
| 93 |
+
|
| 94 |
+
def build_testdata_loader(self, args, target_speaker=None):
|
| 95 |
+
datasets, collate = self.build_test_dataset()
|
| 96 |
+
self.test_dataset = datasets(self.cfg, args, target_speaker)
|
| 97 |
+
self.test_collate = collate(self.cfg)
|
| 98 |
+
self.test_batch_size = min(
|
| 99 |
+
self.cfg.train.batch_size, len(self.test_dataset.metadata)
|
| 100 |
+
)
|
| 101 |
+
test_loader = DataLoader(
|
| 102 |
+
self.test_dataset,
|
| 103 |
+
collate_fn=self.test_collate,
|
| 104 |
+
num_workers=self.args.num_workers,
|
| 105 |
+
batch_size=self.test_batch_size,
|
| 106 |
+
shuffle=False,
|
| 107 |
+
)
|
| 108 |
+
return test_loader
|
| 109 |
+
|
| 110 |
+
def inference_each_batch(self, batch_data):
|
| 111 |
+
raise NotImplementedError
|
| 112 |
+
|
| 113 |
+
def inference_for_batches(self, args, target_speaker=None):
|
| 114 |
+
###### Construct test_batch ######
|
| 115 |
+
loader = self.build_testdata_loader(args, target_speaker)
|
| 116 |
+
|
| 117 |
+
n_batch = len(loader)
|
| 118 |
+
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
|
| 119 |
+
print(
|
| 120 |
+
"Model eval time: {}, batch_size = {}, n_batch = {}".format(
|
| 121 |
+
now, self.test_batch_size, n_batch
|
| 122 |
+
)
|
| 123 |
+
)
|
| 124 |
+
self.model.eval()
|
| 125 |
+
|
| 126 |
+
###### Inference for each batch ######
|
| 127 |
+
pred_res = []
|
| 128 |
+
with torch.no_grad():
|
| 129 |
+
for i, batch_data in enumerate(loader if n_batch == 1 else tqdm(loader)):
|
| 130 |
+
# Put the data to device
|
| 131 |
+
for k, v in batch_data.items():
|
| 132 |
+
batch_data[k] = batch_data[k].to(self.device)
|
| 133 |
+
|
| 134 |
+
y_pred, stats = self.inference_each_batch(batch_data)
|
| 135 |
+
|
| 136 |
+
pred_res += y_pred
|
| 137 |
+
|
| 138 |
+
return pred_res
|
| 139 |
+
|
| 140 |
+
def inference(self, feature):
|
| 141 |
+
raise NotImplementedError
|
| 142 |
+
|
| 143 |
+
def synthesis_by_vocoder(self, pred):
|
| 144 |
+
audios_pred = synthesis(
|
| 145 |
+
self.vocoder_cfg,
|
| 146 |
+
self.checkpoint_dir_vocoder,
|
| 147 |
+
len(pred),
|
| 148 |
+
pred,
|
| 149 |
+
)
|
| 150 |
+
return audios_pred
|
| 151 |
+
|
| 152 |
+
def __call__(self, utt):
|
| 153 |
+
feature = self.build_test_utt_data(utt)
|
| 154 |
+
start_time = time.time()
|
| 155 |
+
with torch.no_grad():
|
| 156 |
+
outputs = self.inference(feature)[0]
|
| 157 |
+
time_used = time.time() - start_time
|
| 158 |
+
rtf = time_used / (
|
| 159 |
+
outputs.shape[1]
|
| 160 |
+
* self.cfg.preprocess.hop_size
|
| 161 |
+
/ self.cfg.preprocess.sample_rate
|
| 162 |
+
)
|
| 163 |
+
print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))
|
| 164 |
+
self.avg_rtf.append(rtf)
|
| 165 |
+
audios = outputs.cpu().squeeze().numpy().reshape(-1, 1)
|
| 166 |
+
return audios
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def base_parser():
|
| 170 |
+
parser = argparse.ArgumentParser()
|
| 171 |
+
parser.add_argument(
|
| 172 |
+
"--config", default="config.json", help="json files for configurations."
|
| 173 |
+
)
|
| 174 |
+
parser.add_argument("--use_ddp_inference", default=False)
|
| 175 |
+
parser.add_argument("--n_workers", default=1, type=int)
|
| 176 |
+
parser.add_argument("--local_rank", default=-1, type=int)
|
| 177 |
+
parser.add_argument(
|
| 178 |
+
"--batch_size", default=1, type=int, help="Batch size for inference"
|
| 179 |
+
)
|
| 180 |
+
parser.add_argument(
|
| 181 |
+
"--num_workers",
|
| 182 |
+
default=1,
|
| 183 |
+
type=int,
|
| 184 |
+
help="Worker number for inference dataloader",
|
| 185 |
+
)
|
| 186 |
+
parser.add_argument(
|
| 187 |
+
"--checkpoint_dir",
|
| 188 |
+
type=str,
|
| 189 |
+
default=None,
|
| 190 |
+
help="Checkpoint dir including model file and configuration",
|
| 191 |
+
)
|
| 192 |
+
parser.add_argument(
|
| 193 |
+
"--checkpoint_file", help="checkpoint file", type=str, default=None
|
| 194 |
+
)
|
| 195 |
+
parser.add_argument(
|
| 196 |
+
"--test_list", help="test utterance list for testing", type=str, default=None
|
| 197 |
+
)
|
| 198 |
+
parser.add_argument(
|
| 199 |
+
"--checkpoint_dir_vocoder",
|
| 200 |
+
help="Vocoder's checkpoint dir including model file and configuration",
|
| 201 |
+
type=str,
|
| 202 |
+
default=None,
|
| 203 |
+
)
|
| 204 |
+
parser.add_argument(
|
| 205 |
+
"--output_dir",
|
| 206 |
+
type=str,
|
| 207 |
+
default=None,
|
| 208 |
+
help="Output dir for saving generated results",
|
| 209 |
+
)
|
| 210 |
+
return parser
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
if __name__ == "__main__":
|
| 214 |
+
parser = base_parser()
|
| 215 |
+
args = parser.parse_args()
|
| 216 |
+
cfg = load_config(args.config)
|
| 217 |
+
|
| 218 |
+
# Build inference
|
| 219 |
+
inference = BaseInference(cfg, args)
|
| 220 |
+
inference()
|
models/base/base_sampler.py
ADDED
|
@@ -0,0 +1,136 @@
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
import random
|
| 8 |
+
|
| 9 |
+
from torch.utils.data import ConcatDataset, Dataset
|
| 10 |
+
from torch.utils.data.sampler import (
|
| 11 |
+
BatchSampler,
|
| 12 |
+
RandomSampler,
|
| 13 |
+
Sampler,
|
| 14 |
+
SequentialSampler,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ScheduledSampler(Sampler):
|
| 19 |
+
"""A sampler that samples data from a given concat-dataset.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
concat_dataset (ConcatDataset): a concatenated dataset consisting of all datasets
|
| 23 |
+
batch_size (int): batch size
|
| 24 |
+
holistic_shuffle (bool): whether to shuffle the whole dataset or not
|
| 25 |
+
logger (logging.Logger): logger to print warning message
|
| 26 |
+
|
| 27 |
+
Usage:
|
| 28 |
+
For cfg.train.batch_size = 3, cfg.train.holistic_shuffle = False, cfg.train.drop_last = True:
|
| 29 |
+
>>> list(ScheduledSampler(ConcatDataset([[0, 1, 2], [3, 4, 5], [6, 7, 8]])))
|
| 30 |
+
[3, 4, 5, 0, 1, 2, 6, 7, 8]
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(
|
| 34 |
+
self,
|
| 35 |
+
concat_dataset,
|
| 36 |
+
batch_size,
|
| 37 |
+
holistic_shuffle,
|
| 38 |
+
logger=None,
|
| 39 |
+
loader_type="train",
|
| 40 |
+
):
|
| 41 |
+
if not isinstance(concat_dataset, ConcatDataset):
|
| 42 |
+
raise ValueError(
|
| 43 |
+
"concat_dataset must be an instance of ConcatDataset, but got {}".format(
|
| 44 |
+
type(concat_dataset)
|
| 45 |
+
)
|
| 46 |
+
)
|
| 47 |
+
if not isinstance(batch_size, int):
|
| 48 |
+
raise ValueError(
|
| 49 |
+
"batch_size must be an integer, but got {}".format(type(batch_size))
|
| 50 |
+
)
|
| 51 |
+
if not isinstance(holistic_shuffle, bool):
|
| 52 |
+
raise ValueError(
|
| 53 |
+
"holistic_shuffle must be a boolean, but got {}".format(
|
| 54 |
+
type(holistic_shuffle)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
self.concat_dataset = concat_dataset
|
| 59 |
+
self.batch_size = batch_size
|
| 60 |
+
self.holistic_shuffle = holistic_shuffle
|
| 61 |
+
|
| 62 |
+
affected_dataset_name = []
|
| 63 |
+
affected_dataset_len = []
|
| 64 |
+
for dataset in concat_dataset.datasets:
|
| 65 |
+
dataset_len = len(dataset)
|
| 66 |
+
dataset_name = dataset.get_dataset_name()
|
| 67 |
+
if dataset_len < batch_size:
|
| 68 |
+
affected_dataset_name.append(dataset_name)
|
| 69 |
+
affected_dataset_len.append(dataset_len)
|
| 70 |
+
|
| 71 |
+
self.type = loader_type
|
| 72 |
+
for dataset_name, dataset_len in zip(
|
| 73 |
+
affected_dataset_name, affected_dataset_len
|
| 74 |
+
):
|
| 75 |
+
if not loader_type == "valid":
|
| 76 |
+
logger.warning(
|
| 77 |
+
"The {} dataset {} has a length of {}, which is smaller than the batch size {}. This may cause unexpected behavior.".format(
|
| 78 |
+
loader_type, dataset_name, dataset_len, batch_size
|
| 79 |
+
)
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
def __len__(self):
|
| 83 |
+
# the number of batches with drop last
|
| 84 |
+
num_of_batches = sum(
|
| 85 |
+
[
|
| 86 |
+
math.floor(len(dataset) / self.batch_size)
|
| 87 |
+
for dataset in self.concat_dataset.datasets
|
| 88 |
+
]
|
| 89 |
+
)
|
| 90 |
+
# if samples are not enough for one batch, we don't drop last
|
| 91 |
+
if self.type == "valid" and num_of_batches < 1:
|
| 92 |
+
return len(self.concat_dataset)
|
| 93 |
+
return num_of_batches * self.batch_size
|
| 94 |
+
|
| 95 |
+
def __iter__(self):
|
| 96 |
+
iters = []
|
| 97 |
+
for dataset in self.concat_dataset.datasets:
|
| 98 |
+
iters.append(
|
| 99 |
+
SequentialSampler(dataset).__iter__()
|
| 100 |
+
if not self.holistic_shuffle
|
| 101 |
+
else RandomSampler(dataset).__iter__()
|
| 102 |
+
)
|
| 103 |
+
# e.g. [0, 200, 400]
|
| 104 |
+
init_indices = [0] + self.concat_dataset.cumulative_sizes[:-1]
|
| 105 |
+
output_batches = []
|
| 106 |
+
for dataset_idx in range(len(self.concat_dataset.datasets)):
|
| 107 |
+
cur_batch = []
|
| 108 |
+
for idx in iters[dataset_idx]:
|
| 109 |
+
cur_batch.append(idx + init_indices[dataset_idx])
|
| 110 |
+
if len(cur_batch) == self.batch_size:
|
| 111 |
+
output_batches.append(cur_batch)
|
| 112 |
+
cur_batch = []
|
| 113 |
+
# if loader_type is valid, we don't need to drop last
|
| 114 |
+
if self.type == "valid" and len(cur_batch) > 0:
|
| 115 |
+
output_batches.append(cur_batch)
|
| 116 |
+
|
| 117 |
+
# force drop last in training
|
| 118 |
+
random.shuffle(output_batches)
|
| 119 |
+
output_indices = [item for sublist in output_batches for item in sublist]
|
| 120 |
+
return iter(output_indices)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def build_samplers(concat_dataset: Dataset, cfg, logger, loader_type):
|
| 124 |
+
sampler = ScheduledSampler(
|
| 125 |
+
concat_dataset,
|
| 126 |
+
cfg.train.batch_size,
|
| 127 |
+
cfg.train.sampler.holistic_shuffle,
|
| 128 |
+
logger,
|
| 129 |
+
loader_type,
|
| 130 |
+
)
|
| 131 |
+
batch_sampler = BatchSampler(
|
| 132 |
+
sampler,
|
| 133 |
+
cfg.train.batch_size,
|
| 134 |
+
cfg.train.sampler.drop_last if not loader_type == "valid" else False,
|
| 135 |
+
)
|
| 136 |
+
return sampler, batch_sampler
|
models/base/base_trainer.py
ADDED
|
@@ -0,0 +1,348 @@
| 1 |
+
# Copyright (c) 2023 Amphion.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import collections
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
import time
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.distributed as dist
|
| 14 |
+
from torch.nn.parallel import DistributedDataParallel
|
| 15 |
+
from torch.utils.data import ConcatDataset, DataLoader
|
| 16 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 17 |
+
|
| 18 |
+
from models.base.base_sampler import BatchSampler
|
| 19 |
+
from utils.util import (
|
| 20 |
+
Logger,
|
| 21 |
+
remove_older_ckpt,
|
| 22 |
+
save_config,
|
| 23 |
+
set_all_random_seed,
|
| 24 |
+
ValueWindow,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class BaseTrainer(object):
|
| 29 |
+
def __init__(self, args, cfg):
|
| 30 |
+
self.args = args
|
| 31 |
+
self.log_dir = args.log_dir
|
| 32 |
+
self.cfg = cfg
|
| 33 |
+
|
| 34 |
+
self.checkpoint_dir = os.path.join(args.log_dir, "checkpoints")
|
| 35 |
+
os.makedirs(self.checkpoint_dir, exist_ok=True)
|
| 36 |
+
if not cfg.train.ddp or args.local_rank == 0:
|
| 37 |
+
self.sw = SummaryWriter(os.path.join(args.log_dir, "events"))
|
| 38 |
+
self.logger = self.build_logger()
|
| 39 |
+
self.time_window = ValueWindow(50)
|
| 40 |
+
|
| 41 |
+
self.step = 0
|
| 42 |
+
self.epoch = -1
|
| 43 |
+
self.max_epochs = self.cfg.train.epochs
|
| 44 |
+
self.max_steps = self.cfg.train.max_steps
|
| 45 |
+
|
| 46 |
+
# set random seed & init distributed training
|
| 47 |
+
set_all_random_seed(self.cfg.train.random_seed)
|
| 48 |
+
if cfg.train.ddp:
|
| 49 |
+
dist.init_process_group(backend="nccl")
|
| 50 |
+
|
| 51 |
+
if cfg.model_type not in ["AutoencoderKL", "AudioLDM"]:
|
| 52 |
+
self.singers = self.build_singers_lut()
|
| 53 |
+
|
| 54 |
+
# setup data_loader
|
| 55 |
+
self.data_loader = self.build_data_loader()
|
| 56 |
+
|
| 57 |
+
# setup model & enable distributed training
|
| 58 |
+
self.model = self.build_model()
|
| 59 |
+
print(self.model)
|
| 60 |
+
|
| 61 |
+
if isinstance(self.model, dict):
|
| 62 |
+
for key, value in self.model.items():
|
| 63 |
+
value.cuda(self.args.local_rank)
|
| 64 |
+
if key == "PQMF":
|
| 65 |
+
continue
|
| 66 |
+
if cfg.train.ddp:
|
| 67 |
+
self.model[key] = DistributedDataParallel(
|
| 68 |
+
value, device_ids=[self.args.local_rank]
|
| 69 |
+
)
|
| 70 |
+
else:
|
| 71 |
+
self.model.cuda(self.args.local_rank)
|
| 72 |
+
if cfg.train.ddp:
|
| 73 |
+
self.model = DistributedDataParallel(
|
| 74 |
+
self.model, device_ids=[self.args.local_rank]
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
# create criterion
|
| 78 |
+
self.criterion = self.build_criterion()
|
| 79 |
+
if isinstance(self.criterion, dict):
|
| 80 |
+
for key, value in self.criterion.items():
|
| 81 |
+
self.criterion[key].cuda(args.local_rank)
|
| 82 |
+
else:
|
| 83 |
+
self.criterion.cuda(self.args.local_rank)
|
| 84 |
+
|
| 85 |
+
# optimizer
|
| 86 |
+
self.optimizer = self.build_optimizer()
|
| 87 |
+
self.scheduler = self.build_scheduler()
|
| 88 |
+
|
| 89 |
+
# save config file
|
| 90 |
+
self.config_save_path = os.path.join(self.checkpoint_dir, "args.json")
|
| 91 |
+
|
| 92 |
+
def build_logger(self):
|
| 93 |
+
log_file = os.path.join(self.checkpoint_dir, "train.log")
|
| 94 |
+
logger = Logger(log_file, level=self.args.log_level).logger
|
| 95 |
+
|
| 96 |
+
return logger
|
| 97 |
+
|
| 98 |
+
def build_dataset(self):
|
| 99 |
+
raise NotImplementedError
|
| 100 |
+
|
| 101 |
+
def build_data_loader(self):
|
| 102 |
+
Dataset, Collator = self.build_dataset()
|
| 103 |
+
# build dataset instance for each dataset and combine them by ConcatDataset
|
| 104 |
+
datasets_list = []
|
| 105 |
+
for dataset in self.cfg.dataset:
|
| 106 |
+
subdataset = Dataset(self.cfg, dataset, is_valid=False)
|
| 107 |
+
datasets_list.append(subdataset)
|
| 108 |
+
train_dataset = ConcatDataset(datasets_list)
|
| 109 |
+
|
| 110 |
+
train_collate = Collator(self.cfg)
|
| 111 |
+
# TODO: multi-GPU training
|
| 112 |
+
if self.cfg.train.ddp:
|
| 113 |
+
raise NotImplementedError("DDP is not supported yet.")
|
| 114 |
+
|
| 115 |
+
# sampler will provide indices to batch_sampler, which will perform batching and yield batch indices
|
| 116 |
+
batch_sampler = BatchSampler(
|
| 117 |
+
cfg=self.cfg, concat_dataset=train_dataset, dataset_list=datasets_list
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
# use batch_sampler argument instead of (sampler, shuffle, drop_last, batch_size)
|
| 121 |
+
train_loader = DataLoader(
|
| 122 |
+
train_dataset,
|
| 123 |
+
collate_fn=train_collate,
|
| 124 |
+
num_workers=self.args.num_workers,
|
| 125 |
+
batch_sampler=batch_sampler,
|
| 126 |
+
pin_memory=False,
|
| 127 |
+
)
|
| 128 |
+
if not self.cfg.train.ddp or self.args.local_rank == 0:
|
| 129 |
+
datasets_list = []
|
| 130 |
+
for dataset in self.cfg.dataset:
|
| 131 |
+
subdataset = Dataset(self.cfg, dataset, is_valid=True)
|
| 132 |
+
datasets_list.append(subdataset)
|
| 133 |
+
valid_dataset = ConcatDataset(datasets_list)
|
| 134 |
+
valid_collate = Collator(self.cfg)
|
| 135 |
+
batch_sampler = BatchSampler(
|
| 136 |
+
cfg=self.cfg, concat_dataset=valid_dataset, dataset_list=datasets_list
|
| 137 |
+
)
|
| 138 |
+
valid_loader = DataLoader(
|
| 139 |
+
valid_dataset,
|
| 140 |
+
collate_fn=valid_collate,
|
| 141 |
+
num_workers=1,
|
| 142 |
+
batch_sampler=batch_sampler,
|
| 143 |
+
)
|
| 144 |
+
else:
|
| 145 |
+
raise NotImplementedError("DDP is not supported yet.")
|
| 146 |
+
# valid_loader = None
|
| 147 |
+
data_loader = {"train": train_loader, "valid": valid_loader}
|
| 148 |
+
return data_loader
|
| 149 |
+
|
| 150 |
+
def build_singers_lut(self):
|
| 151 |
+
# combine singers
|
| 152 |
+
if not os.path.exists(os.path.join(self.log_dir, self.cfg.preprocess.spk2id)):
|
| 153 |
+
singers = collections.OrderedDict()
|
| 154 |
+
else:
|
| 155 |
+
with open(
|
| 156 |
+
os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "r"
|
| 157 |
+
) as singer_file:
|
| 158 |
+
singers = json.load(singer_file)
|
| 159 |
+
singer_count = len(singers)
|
| 160 |
+
for dataset in self.cfg.dataset:
|
            singer_lut_path = os.path.join(
                self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
            )
            with open(singer_lut_path, "r") as singer_lut_path:
                singer_lut = json.load(singer_lut_path)
            for singer in singer_lut.keys():
                if singer not in singers:
                    singers[singer] = singer_count
                    singer_count += 1
        with open(
            os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "w"
        ) as singer_file:
            json.dump(singers, singer_file, indent=4, ensure_ascii=False)
        print(
            "singers have been dumped to {}".format(
                os.path.join(self.log_dir, self.cfg.preprocess.spk2id)
            )
        )
        return singers

    def build_model(self):
        raise NotImplementedError()

    def build_optimizer(self):
        raise NotImplementedError

    def build_scheduler(self):
        raise NotImplementedError()

    def build_criterion(self):
        raise NotImplementedError

    def get_state_dict(self):
        raise NotImplementedError

    def save_config_file(self):
        save_config(self.config_save_path, self.cfg)

    # TODO, save without module.
    def save_checkpoint(self, state_dict, saved_model_path):
        torch.save(state_dict, saved_model_path)

    def load_checkpoint(self):
        checkpoint_path = os.path.join(self.checkpoint_dir, "checkpoint")
        assert os.path.exists(checkpoint_path)
        checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
        model_path = os.path.join(self.checkpoint_dir, checkpoint_filename)
        assert os.path.exists(model_path)
        if not self.cfg.train.ddp or self.args.local_rank == 0:
            self.logger.info(f"Re(store) from {model_path}")
        checkpoint = torch.load(model_path, map_location="cpu")
        return checkpoint

    def load_model(self, checkpoint):
        raise NotImplementedError

    def restore(self):
        checkpoint = self.load_checkpoint()
        self.load_model(checkpoint)

    def train_step(self, data):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    @torch.no_grad()
    def eval_step(self):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def write_summary(self, losses, stats):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def write_valid_summary(self, losses, stats):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def echo_log(self, losses, mode="Training"):
        message = [
            "{} - Epoch {} Step {}: [{:.3f} s/step]".format(
                mode, self.epoch + 1, self.step, self.time_window.average
            )
        ]

        for key in sorted(losses.keys()):
            if isinstance(losses[key], dict):
                for k, v in losses[key].items():
                    message.append(
                        str(k).split("/")[-1] + "=" + str(round(float(v), 5))
                    )
            else:
                message.append(
                    str(key).split("/")[-1] + "=" + str(round(float(losses[key]), 5))
                )
        self.logger.info(", ".join(message))

    def eval_epoch(self):
        self.logger.info("Validation...")
        valid_losses = {}
        for i, batch_data in enumerate(self.data_loader["valid"]):
            for k, v in batch_data.items():
                if isinstance(v, torch.Tensor):
                    batch_data[k] = v.cuda()
            valid_loss, valid_stats, total_valid_loss = self.eval_step(batch_data, i)
            for key in valid_loss:
                if key not in valid_losses:
                    valid_losses[key] = 0
                valid_losses[key] += valid_loss[key]

        # Add mel and audio to the Tensorboard
        # Average loss
        for key in valid_losses:
            valid_losses[key] /= i + 1
        self.echo_log(valid_losses, "Valid")
        return valid_losses, valid_stats

    def train_epoch(self):
        for i, batch_data in enumerate(self.data_loader["train"]):
            start_time = time.time()
            # Put the data to cuda device
            for k, v in batch_data.items():
                if isinstance(v, torch.Tensor):
                    batch_data[k] = v.cuda(self.args.local_rank)

            # Training step
            train_losses, train_stats, total_loss = self.train_step(batch_data)
            self.time_window.append(time.time() - start_time)

            if self.args.local_rank == 0 or not self.cfg.train.ddp:
                if self.step % self.args.stdout_interval == 0:
                    self.echo_log(train_losses, "Training")

                if self.step % self.cfg.train.save_summary_steps == 0:
                    self.logger.info(f"Save summary as step {self.step}")
                    self.write_summary(train_losses, train_stats)

                if (
                    self.step % self.cfg.train.save_checkpoints_steps == 0
                    and self.step != 0
                ):
                    saved_model_name = "step-{:07d}_loss-{:.4f}.pt".format(
                        self.step, total_loss
                    )
                    saved_model_path = os.path.join(
                        self.checkpoint_dir, saved_model_name
                    )
                    saved_state_dict = self.get_state_dict()
                    self.save_checkpoint(saved_state_dict, saved_model_path)
                    self.save_config_file()
                    # keep max n models
                    remove_older_ckpt(
                        saved_model_name,
                        self.checkpoint_dir,
                        max_to_keep=self.cfg.train.keep_checkpoint_max,
                    )

                if self.step != 0 and self.step % self.cfg.train.valid_interval == 0:
                    if isinstance(self.model, dict):
                        for key in self.model.keys():
                            self.model[key].eval()
                    else:
                        self.model.eval()
                    # Evaluate one epoch and get average loss
                    valid_losses, valid_stats = self.eval_epoch()
                    if isinstance(self.model, dict):
                        for key in self.model.keys():
                            self.model[key].train()
                    else:
                        self.model.train()
                    # Write validation losses to summary.
                    self.write_valid_summary(valid_losses, valid_stats)
            self.step += 1

    def train(self):
        for epoch in range(max(0, self.epoch), self.max_epochs):
            self.train_epoch()
            self.epoch += 1
            if self.step > self.max_steps:
                self.logger.info("Training finished!")
                break
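Note (not part of the diff): the legacy trainer above only provides the training/validation loops; concrete models fill in the build_* hooks and per-batch steps. The following is a minimal illustrative sketch of such a subclass, assuming the base __init__ has already assigned self.model, self.optimizer, and self.criterion from these hooks; the toy linear model, MSE loss, and the `adam` config section are placeholders, not Amphion code.

import torch
import torch.nn as nn


class ToyTrainer(BaseTrainer):  # BaseTrainer as defined above
    def build_model(self):
        # Placeholder model; a real trainer would build its acoustic model from self.cfg.model.
        return nn.Linear(80, 80)

    def build_criterion(self):
        return nn.MSELoss()

    def build_optimizer(self):
        # Assumes an `adam` section exists in the training config (hypothetical).
        return torch.optim.Adam(self.model.parameters(), **self.cfg.train.adam)

    def get_state_dict(self):
        return {"model": self.model.state_dict(), "step": self.step}

    def train_step(self, data):
        pred = self.model(data["mel"])
        loss = self.criterion(pred, data["mel"])
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # train_epoch() expects (losses_dict, stats_dict, total_loss)
        return {"mse": loss.item()}, {}, loss.item()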

models/base/new_dataset.py
ADDED
@@ -0,0 +1,50 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from abc import abstractmethod
from pathlib import Path

import json5
import torch
import yaml


# TODO: for training and validating
class BaseDataset(torch.utils.data.Dataset):
    r"""Base dataset for training and validating."""

    def __init__(self, args, cfg, is_valid=False):
        pass


class BaseTestDataset(torch.utils.data.Dataset):
    r"""Test dataset for inference."""

    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        assert infer_type in ["from_dataset", "from_file"]

        self.args = args
        self.cfg = cfg
        self.infer_type = infer_type

    @abstractmethod
    def __getitem__(self, index):
        pass

    def __len__(self):
        return len(self.metadata)

    def get_metadata(self):
        path = Path(self.args.source)
        if path.suffix == ".json" or path.suffix == ".jsonc":
            metadata = json5.load(open(self.args.source, "r"))
        elif path.suffix == ".yaml" or path.suffix == ".yml":
            metadata = yaml.full_load(open(self.args.source, "r"))
        else:
            raise ValueError(f"Unsupported file type: {path.suffix}")

        return metadata
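For reference (not part of the diff): get_metadata() above only assumes that `args.source` points at a JSON/JSON5 or YAML file that parses into the dataset's metadata list. A hypothetical "from_file" source could look like the Python literal below; the keys mirror the Dataset/Uid convention used elsewhere in this commit, while the concrete values and paths are made up.

# Hypothetical content of an `args.source` file for infer_type="from_file".
example_metadata = [
    {"Dataset": "demo", "Uid": "utt_0001", "Path": "/path/to/utt_0001.wav"},
    {"Dataset": "demo", "Uid": "utt_0002", "Path": "/path/to/utt_0002.wav"},
]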

models/base/new_inference.py
ADDED
@@ -0,0 +1,249 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import random
import re
import time
from abc import abstractmethod
from pathlib import Path

import accelerate
import json5
import numpy as np
import torch
from accelerate.logging import get_logger
from torch.utils.data import DataLoader

from models.vocoders.vocoder_inference import synthesis
from utils.io import save_audio
from utils.util import load_config
from utils.audio_slicer import is_silence

EPS = 1.0e-12


class BaseInference(object):
    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        super().__init__()

        start = time.monotonic_ns()
        self.args = args
        self.cfg = cfg

        assert infer_type in ["from_dataset", "from_file"]
        self.infer_type = infer_type

        # init with accelerate
        self.accelerator = accelerate.Accelerator()
        self.accelerator.wait_for_everyone()

        # Use accelerate logger for distributed inference
        with self.accelerator.main_process_first():
            self.logger = get_logger("inference", log_level=args.log_level)

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")
        self.logger.debug(f"Using {args.log_level.upper()} logging level.")

        self.acoustics_dir = args.acoustics_dir
        self.logger.debug(f"Acoustic dir: {args.acoustics_dir}")
        self.vocoder_dir = args.vocoder_dir
        self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")
        # should be in svc inferencer
        # self.target_singer = args.target_singer
        # self.logger.info(f"Target singers: {args.target_singer}")
        # self.trans_key = args.trans_key
        # self.logger.info(f"Trans key: {args.trans_key}")

        os.makedirs(args.output_dir, exist_ok=True)

        # set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # setup data_loader
        with self.accelerator.main_process_first():
            self.logger.info("Building dataset...")
            start = time.monotonic_ns()
            self.test_dataloader = self._build_dataloader()
            end = time.monotonic_ns()
            self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")

        # setup model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            # self.logger.debug(self.model)
            self.logger.info(f"Building model done in {(end - start) / 1e6:.3f}ms")

        # init with accelerate
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        self.accelerator = accelerate.Accelerator()
        self.model = self.accelerator.prepare(self.model)
        end = time.monotonic_ns()
        self.accelerator.wait_for_everyone()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.3f}ms")

        with self.accelerator.main_process_first():
            self.logger.info("Loading checkpoint...")
            start = time.monotonic_ns()
            # TODO: Also, suppose only use latest one yet
            self.__load_model(os.path.join(args.acoustics_dir, "checkpoint"))
            end = time.monotonic_ns()
            self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.3f}ms")

        self.model.eval()
        self.accelerator.wait_for_everyone()

    ### Abstract methods ###
    @abstractmethod
    def _build_test_dataset(self):
        pass

    @abstractmethod
    def _build_model(self):
        pass

    @abstractmethod
    @torch.inference_mode()
    def _inference_each_batch(self, batch_data):
        pass

    ### Abstract methods end ###

    @torch.inference_mode()
    def inference(self):
        for i, batch in enumerate(self.test_dataloader):
            y_pred = self._inference_each_batch(batch).cpu()
            mel_min, mel_max = self.test_dataset.target_mel_extrema
            y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
            y_ls = y_pred.chunk(self.test_batch_size)
            tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
            j = 0
            for it, l in zip(y_ls, tgt_ls):
                l = l.item()
                it = it.squeeze(0)[:l]
                uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
                torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
                j += 1

        vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir)

        res = synthesis(
            cfg=vocoder_cfg,
            vocoder_weight_file=vocoder_ckpt,
            n_samples=None,
            pred=[
                torch.load(
                    os.path.join(self.args.output_dir, "{}.pt".format(i["Uid"]))
                ).numpy(force=True)
                for i in self.test_dataset.metadata
            ],
        )

        output_audio_files = []
        for it, wav in zip(self.test_dataset.metadata, res):
            uid = it["Uid"]
            file = os.path.join(self.args.output_dir, f"{uid}.wav")
            output_audio_files.append(file)

            wav = wav.numpy(force=True)
            save_audio(
                file,
                wav,
                self.cfg.preprocess.sample_rate,
                add_silence=False,
                turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate),
            )
            os.remove(os.path.join(self.args.output_dir, f"{uid}.pt"))

        return sorted(output_audio_files)

    # TODO: LEGACY CODE
    def _build_dataloader(self):
        datasets, collate = self._build_test_dataset()
        self.test_dataset = datasets(self.args, self.cfg, self.infer_type)
        self.test_collate = collate(self.cfg)
        self.test_batch_size = min(
            self.cfg.train.batch_size, len(self.test_dataset.metadata)
        )
        test_dataloader = DataLoader(
            self.test_dataset,
            collate_fn=self.test_collate,
            num_workers=1,
            batch_size=self.test_batch_size,
            shuffle=False,
        )
        return test_dataloader

    def __load_model(self, checkpoint_dir: str = None, checkpoint_path: str = None):
        r"""Load model from checkpoint. If checkpoint_path is None, it will
        load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
        None, it will load the checkpoint specified by checkpoint_path. **Only use this
        method after** ``accelerator.prepare()``.
        """
        if checkpoint_path is None:
            ls = []
            for i in Path(checkpoint_dir).iterdir():
                if re.match(r"epoch-\d+_step-\d+_loss-[\d.]+", str(i.stem)):
                    ls.append(i)
            ls.sort(
                key=lambda x: int(x.stem.split("_")[-3].split("-")[-1]), reverse=True
            )
            checkpoint_path = ls[0]
        else:
            checkpoint_path = Path(checkpoint_path)
        self.accelerator.load_state(str(checkpoint_path))
        # set epoch and step
        self.epoch = int(checkpoint_path.stem.split("_")[-3].split("-")[-1])
        self.step = int(checkpoint_path.stem.split("_")[-2].split("-")[-1])
        return str(checkpoint_path)

    @staticmethod
    def _set_random_seed(seed):
        r"""Set random seed for all possible random modules."""
        random.seed(seed)
        np.random.seed(seed)
        torch.random.manual_seed(seed)

    @staticmethod
    def _parse_vocoder(vocoder_dir):
        r"""Parse vocoder config"""
        vocoder_dir = os.path.abspath(vocoder_dir)
        ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
        ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
        ckpt_path = str(ckpt_list[0])
        vocoder_cfg = load_config(
            os.path.join(vocoder_dir, "args.json"), lowercase=True
        )
        return vocoder_cfg, ckpt_path

    @staticmethod
    def __count_parameters(model):
        return sum(p.numel() for p in model.parameters())

    def __dump_cfg(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        json5.dump(
            self.cfg,
            open(path, "w"),
            indent=4,
            sort_keys=True,
            ensure_ascii=False,
            quote_keys=True,
        )
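For orientation only (not part of the diff): a concrete inferencer built on BaseInference above supplies the three abstract methods. The sketch below is an assumption-laden outline; TestDataset, TestCollator, and AcousticModel are placeholder names, not Amphion classes. Note that inference() expects _inference_each_batch to return mel predictions normalized to [-1, 1], which it rescales with the dataset's target_mel_extrema before vocoding.

class SketchInference(BaseInference):  # BaseInference as defined above
    def _build_test_dataset(self):
        # Return the (Dataset, Collator) classes; _build_dataloader() instantiates them.
        return TestDataset, TestCollator  # placeholders

    def _build_model(self):
        return AcousticModel(self.cfg.model)  # placeholder

    @torch.inference_mode()
    def _inference_each_batch(self, batch_data):
        # Predictions are expected in the normalized [-1, 1] mel range;
        # inference() maps them back using self.test_dataset.target_mel_extrema.
        return self.model(batch_data)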

models/base/new_trainer.py
ADDED
@@ -0,0 +1,722 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
import random
import shutil
import time
from abc import abstractmethod
from pathlib import Path

import accelerate
import json5
import numpy as np
import torch
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration
from torch.utils.data import ConcatDataset, DataLoader
from tqdm import tqdm

from models.base.base_sampler import build_samplers
from optimizer.optimizers import NoamLR


class BaseTrainer(object):
    r"""The base trainer for all tasks. Any trainer should inherit from this class."""

    def __init__(self, args=None, cfg=None):
        super().__init__()

        self.args = args
        self.cfg = cfg

        cfg.exp_name = args.exp_name

        # init with accelerate
        self._init_accelerator()
        self.accelerator.wait_for_everyone()

        # Use accelerate logger for distributed training
        with self.accelerator.main_process_first():
            self.logger = get_logger(args.exp_name, log_level=args.log_level)

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New training process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")
        self.logger.debug(f"Using {args.log_level.upper()} logging level.")
        self.logger.info(f"Experiment name: {args.exp_name}")
        self.logger.info(f"Experiment directory: {self.exp_dir}")
        self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
        if self.accelerator.is_main_process:
            os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")

        # init counts
        self.batch_count: int = 0
        self.step: int = 0
        self.epoch: int = 0
        self.max_epoch = (
            self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
        )
        self.logger.info(
            "Max epoch: {}".format(
                self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
            )
        )

        # Check values
        if self.accelerator.is_main_process:
            self.__check_basic_configs()
            # Set runtime configs
            self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
            self.checkpoints_path = [
                [] for _ in range(len(self.save_checkpoint_stride))
            ]
            self.keep_last = [
                i if i > 0 else float("inf") for i in self.cfg.train.keep_last
            ]
            self.run_eval = self.cfg.train.run_eval

        # set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # setup data_loader
        with self.accelerator.main_process_first():
            self.logger.info("Building dataset...")
            start = time.monotonic_ns()
            self.train_dataloader, self.valid_dataloader = self._build_dataloader()
            end = time.monotonic_ns()
            self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")

        # setup model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            self.logger.debug(self.model)
            self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
            self.logger.info(
                f"Model parameters: {self.__count_parameters(self.model)/1e6:.2f}M"
            )
        # optimizer & scheduler
        with self.accelerator.main_process_first():
            self.logger.info("Building optimizer and scheduler...")
            start = time.monotonic_ns()
            self.optimizer = self.__build_optimizer()
            self.scheduler = self.__build_scheduler()
            end = time.monotonic_ns()
            self.logger.info(
                f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
            )

        # accelerate prepare
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        (
            self.train_dataloader,
            self.valid_dataloader,
            self.model,
            self.optimizer,
            self.scheduler,
        ) = self.accelerator.prepare(
            self.train_dataloader,
            self.valid_dataloader,
            self.model,
            self.optimizer,
            self.scheduler,
        )
        end = time.monotonic_ns()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")

        # create criterion
        with self.accelerator.main_process_first():
            self.logger.info("Building criterion...")
            start = time.monotonic_ns()
            self.criterion = self._build_criterion()
            end = time.monotonic_ns()
            self.logger.info(f"Building criterion done in {(end - start) / 1e6:.2f}ms")

        # Resume or Finetune
        with self.accelerator.main_process_first():
            if args.resume:
                ## Automatically resume according to the current exprimental name
                self.logger.info("Resuming from {}...".format(self.checkpoint_dir))
                start = time.monotonic_ns()
                ckpt_path = self.__load_model(
                    checkpoint_dir=self.checkpoint_dir, resume_type=args.resume_type
                )
                end = time.monotonic_ns()
                self.logger.info(
                    f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
                )
                self.checkpoints_path = json.load(
                    open(os.path.join(ckpt_path, "ckpts.json"), "r")
                )
            elif args.resume_from_ckpt_path and args.resume_from_ckpt_path != "":
                ## Resume from the given checkpoint path
                if not os.path.exists(args.resume_from_ckpt_path):
                    raise ValueError(
                        "[Error] The resumed checkpoint path {} don't exist.".format(
                            args.resume_from_ckpt_path
                        )
                    )

                self.logger.info(
                    "Resuming from {}...".format(args.resume_from_ckpt_path)
                )
                start = time.monotonic_ns()
                ckpt_path = self.__load_model(
                    checkpoint_path=args.resume_from_ckpt_path,
                    resume_type=args.resume_type,
                )
                end = time.monotonic_ns()
                self.logger.info(
                    f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
                )

        # save config file path
        self.config_save_path = os.path.join(self.exp_dir, "args.json")

    ### Following are abstract methods that should be implemented in child classes ###
    @abstractmethod
    def _build_dataset(self):
        r"""Build dataset for model training/validating/evaluating."""
        pass

    @staticmethod
    @abstractmethod
    def _build_criterion():
        r"""Build criterion function for model loss calculation."""
        pass

    @abstractmethod
    def _build_model(self):
        r"""Build model for training/validating/evaluating."""
        pass

    @abstractmethod
    def _forward_step(self, batch):
        r"""One forward step of the neural network. This abstract method is trying to
        unify ``_train_step`` and ``_valid_step`` and avoid redundant implementation.
        However, for special case that using different forward step pattern for
        training and validating, you could just override this method with ``pass`` and
        implement ``_train_step`` and ``_valid_step`` separately.
        """
        pass

    @abstractmethod
    def _save_auxiliary_states(self):
        r"""To save some auxiliary states when saving model's ckpt"""
        pass

    ### Abstract methods end ###

    ### THIS IS MAIN ENTRY ###
    def train_loop(self):
        r"""Training loop. The public entry of training process."""
        # Wait everyone to prepare before we move on
        self.accelerator.wait_for_everyone()
        # dump config file
        if self.accelerator.is_main_process:
            self.__dump_cfg(self.config_save_path)
        self.model.train()
        self.optimizer.zero_grad()
        # Wait to ensure good to go
        self.accelerator.wait_for_everyone()
        while self.epoch < self.max_epoch:
            self.logger.info("\n")
            self.logger.info("-" * 32)
            self.logger.info("Epoch {}: ".format(self.epoch))

            ### TODO: change the return values of _train_epoch() to a loss dict, or (total_loss, loss_dict)
            ### It's inconvenient for the model with multiple losses
            # Do training & validating epoch
            train_loss = self._train_epoch()
            self.logger.info("  |- Train/Loss: {:.6f}".format(train_loss))
            valid_loss = self._valid_epoch()
            self.logger.info("  |- Valid/Loss: {:.6f}".format(valid_loss))
            self.accelerator.log(
                {"Epoch/Train Loss": train_loss, "Epoch/Valid Loss": valid_loss},
                step=self.epoch,
            )

            self.accelerator.wait_for_everyone()
            # TODO: what is scheduler?
            self.scheduler.step(valid_loss)  # FIXME: use epoch track correct?

            # Check if hit save_checkpoint_stride and run_eval
            run_eval = False
            if self.accelerator.is_main_process:
                save_checkpoint = False
                hit_dix = []
                for i, num in enumerate(self.save_checkpoint_stride):
                    if self.epoch % num == 0:
                        save_checkpoint = True
                        hit_dix.append(i)
                        run_eval |= self.run_eval[i]

            self.accelerator.wait_for_everyone()
            if self.accelerator.is_main_process and save_checkpoint:
                path = os.path.join(
                    self.checkpoint_dir,
                    "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
                        self.epoch, self.step, train_loss
                    ),
                )
                self.tmp_checkpoint_save_path = path
                self.accelerator.save_state(path)
                print(f"save checkpoint in {path}")
                json.dump(
                    self.checkpoints_path,
                    open(os.path.join(path, "ckpts.json"), "w"),
                    ensure_ascii=False,
                    indent=4,
                )
                self._save_auxiliary_states()

                # Remove old checkpoints
                to_remove = []
                for idx in hit_dix:
                    self.checkpoints_path[idx].append(path)
                    while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
                        to_remove.append((idx, self.checkpoints_path[idx].pop(0)))

                # Search conflicts
                total = set()
                for i in self.checkpoints_path:
                    total |= set(i)
                do_remove = set()
                for idx, path in to_remove[::-1]:
                    if path in total:
                        self.checkpoints_path[idx].insert(0, path)
                    else:
                        do_remove.add(path)

                # Remove old checkpoints
                for path in do_remove:
                    shutil.rmtree(path, ignore_errors=True)
                    self.logger.debug(f"Remove old checkpoint: {path}")

            self.accelerator.wait_for_everyone()
            if run_eval:
                # TODO: run evaluation
                pass

            # Update info for each epoch
            self.epoch += 1

        # Finish training and save final checkpoint
        self.accelerator.wait_for_everyone()
        if self.accelerator.is_main_process:
            self.accelerator.save_state(
                os.path.join(
                    self.checkpoint_dir,
                    "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
                        self.epoch, self.step, valid_loss
                    ),
                )
            )
            self._save_auxiliary_states()

        self.accelerator.end_training()

    ### Following are methods that can be used directly in child classes ###
    def _train_epoch(self):
        r"""Training epoch. Should return average loss of a batch (sample) over
        one epoch. See ``train_loop`` for usage.
        """
        self.model.train()
        epoch_sum_loss: float = 0.0
        epoch_step: int = 0
        for batch in tqdm(
            self.train_dataloader,
            desc=f"Training Epoch {self.epoch}",
            unit="batch",
            colour="GREEN",
            leave=False,
            dynamic_ncols=True,
            smoothing=0.04,
            disable=not self.accelerator.is_main_process,
        ):
            # Do training step and BP
            with self.accelerator.accumulate(self.model):
                loss = self._train_step(batch)
                self.accelerator.backward(loss)
                self.optimizer.step()
                self.optimizer.zero_grad()
            self.batch_count += 1

            # Update info for each step
            # TODO: step means BP counts or batch counts?
            if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
                epoch_sum_loss += loss
                self.accelerator.log(
                    {
                        "Step/Train Loss": loss,
                        "Step/Learning Rate": self.optimizer.param_groups[0]["lr"],
                    },
                    step=self.step,
                )
                self.step += 1
                epoch_step += 1

        self.accelerator.wait_for_everyone()
        return (
            epoch_sum_loss
            / len(self.train_dataloader)
            * self.cfg.train.gradient_accumulation_step
        )

    @torch.inference_mode()
    def _valid_epoch(self):
        r"""Testing epoch. Should return average loss of a batch (sample) over
        one epoch. See ``train_loop`` for usage.
        """
        self.model.eval()
        epoch_sum_loss = 0.0
        for batch in tqdm(
            self.valid_dataloader,
            desc=f"Validating Epoch {self.epoch}",
            unit="batch",
            colour="GREEN",
            leave=False,
            dynamic_ncols=True,
            smoothing=0.04,
            disable=not self.accelerator.is_main_process,
        ):
            batch_loss = self._valid_step(batch)
            epoch_sum_loss += batch_loss.item()

        self.accelerator.wait_for_everyone()
        return epoch_sum_loss / len(self.valid_dataloader)

    def _train_step(self, batch):
        r"""Training forward step. Should return average loss of a sample over
        one batch. Provoke ``_forward_step`` is recommended except for special case.
        See ``_train_epoch`` for usage.
        """
        return self._forward_step(batch)

    @torch.inference_mode()
    def _valid_step(self, batch):
        r"""Testing forward step. Should return average loss of a sample over
        one batch. Provoke ``_forward_step`` is recommended except for special case.
        See ``_test_epoch`` for usage.
        """
        return self._forward_step(batch)

    def __load_model(
        self,
        checkpoint_dir: str = None,
        checkpoint_path: str = None,
        resume_type: str = "",
    ):
        r"""Load model from checkpoint. If checkpoint_path is None, it will
        load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
        None, it will load the checkpoint specified by checkpoint_path. **Only use this
        method after** ``accelerator.prepare()``.
        """
        if checkpoint_path is None:
            ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
            ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
            checkpoint_path = ls[0]
        self.logger.info("Resume from {}...".format(checkpoint_path))

        if resume_type in ["resume", ""]:
            # Load all the things, including model weights, optimizer, scheduler, and random states.
            self.accelerator.load_state(input_dir=checkpoint_path)

            # set epoch and step
            self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
            self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1

        elif resume_type == "finetune":
            # Load only the model weights
            accelerate.load_checkpoint_and_dispatch(
                self.accelerator.unwrap_model(self.model),
                os.path.join(checkpoint_path, "pytorch_model.bin"),
            )
            self.logger.info("Load model weights for finetune...")

        else:
            raise ValueError("Resume_type must be `resume` or `finetune`.")

        return checkpoint_path

    # TODO: LEGACY CODE
    def _build_dataloader(self):
        Dataset, Collator = self._build_dataset()

        # build dataset instance for each dataset and combine them by ConcatDataset
        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=False)
            datasets_list.append(subdataset)
        train_dataset = ConcatDataset(datasets_list)
        train_collate = Collator(self.cfg)
        _, batch_sampler = build_samplers(train_dataset, self.cfg, self.logger, "train")
        self.logger.debug(f"train batch_sampler: {list(batch_sampler)}")
        self.logger.debug(f"length: {train_dataset.cumulative_sizes}")
        # TODO: use config instead of (sampler, shuffle, drop_last, batch_size)
        train_loader = DataLoader(
            train_dataset,
            collate_fn=train_collate,
            batch_sampler=batch_sampler,
            num_workers=self.cfg.train.dataloader.num_worker,
            pin_memory=self.cfg.train.dataloader.pin_memory,
        )

        # Build valid dataloader
        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=True)
            datasets_list.append(subdataset)
        valid_dataset = ConcatDataset(datasets_list)
        valid_collate = Collator(self.cfg)
        _, batch_sampler = build_samplers(valid_dataset, self.cfg, self.logger, "valid")
        self.logger.debug(f"valid batch_sampler: {list(batch_sampler)}")
        self.logger.debug(f"length: {valid_dataset.cumulative_sizes}")
        valid_loader = DataLoader(
            valid_dataset,
            collate_fn=valid_collate,
            batch_sampler=batch_sampler,
            num_workers=self.cfg.train.dataloader.num_worker,
            pin_memory=self.cfg.train.dataloader.pin_memory,
        )
        return train_loader, valid_loader

    @staticmethod
    def _set_random_seed(seed):
        r"""Set random seed for all possible random modules."""
        random.seed(seed)
        np.random.seed(seed)
        torch.random.manual_seed(seed)

    def _check_nan(self, loss, y_pred, y_gt):
        if torch.any(torch.isnan(loss)):
            self.logger.fatal("Fatal Error: Training is down since loss has Nan!")
            self.logger.error("loss = {:.6f}".format(loss.item()), in_order=True)
            if torch.any(torch.isnan(y_pred)):
                self.logger.error(
                    f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
                )
            else:
                self.logger.debug(
                    f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
                )
            if torch.any(torch.isnan(y_gt)):
                self.logger.error(
                    f"y_gt has Nan: {torch.any(torch.isnan(y_gt))}", in_order=True
                )
            else:
                self.logger.debug(
                    f"y_gt has nan: {torch.any(torch.isnan(y_gt))}", in_order=True
                )
            if torch.any(torch.isnan(y_pred)):
                self.logger.error(f"y_pred: {y_pred}", in_order=True)
            else:
                self.logger.debug(f"y_pred: {y_pred}", in_order=True)
            if torch.any(torch.isnan(y_gt)):
                self.logger.error(f"y_gt: {y_gt}", in_order=True)
            else:
                self.logger.debug(f"y_gt: {y_gt}", in_order=True)

            # TODO: still OK to save tracking?
            self.accelerator.end_training()
            raise RuntimeError("Loss has Nan! See log for more info.")

    ### Protected methods end ###

    ## Following are private methods ##
    ## !!! These are inconvenient for GAN-based model training. It'd be better to move these to svc_trainer.py if needed.
    def __build_optimizer(self):
        r"""Build optimizer for model."""
        # Make case-insensitive matching
        if self.cfg.train.optimizer.lower() == "adadelta":
            optimizer = torch.optim.Adadelta(
                self.model.parameters(), **self.cfg.train.adadelta
            )
            self.logger.info("Using Adadelta optimizer.")
        elif self.cfg.train.optimizer.lower() == "adagrad":
            optimizer = torch.optim.Adagrad(
                self.model.parameters(), **self.cfg.train.adagrad
            )
            self.logger.info("Using Adagrad optimizer.")
        elif self.cfg.train.optimizer.lower() == "adam":
            optimizer = torch.optim.Adam(self.model.parameters(), **self.cfg.train.adam)
            self.logger.info("Using Adam optimizer.")
        elif self.cfg.train.optimizer.lower() == "adamw":
            optimizer = torch.optim.AdamW(
                self.model.parameters(), **self.cfg.train.adamw
            )
        elif self.cfg.train.optimizer.lower() == "sparseadam":
            optimizer = torch.optim.SparseAdam(
                self.model.parameters(), **self.cfg.train.sparseadam
            )
        elif self.cfg.train.optimizer.lower() == "adamax":
            optimizer = torch.optim.Adamax(
                self.model.parameters(), **self.cfg.train.adamax
            )
        elif self.cfg.train.optimizer.lower() == "asgd":
            optimizer = torch.optim.ASGD(self.model.parameters(), **self.cfg.train.asgd)
        elif self.cfg.train.optimizer.lower() == "lbfgs":
            optimizer = torch.optim.LBFGS(
                self.model.parameters(), **self.cfg.train.lbfgs
            )
        elif self.cfg.train.optimizer.lower() == "nadam":
            optimizer = torch.optim.NAdam(
                self.model.parameters(), **self.cfg.train.nadam
            )
        elif self.cfg.train.optimizer.lower() == "radam":
            optimizer = torch.optim.RAdam(
                self.model.parameters(), **self.cfg.train.radam
            )
        elif self.cfg.train.optimizer.lower() == "rmsprop":
            optimizer = torch.optim.RMSprop(
                self.model.parameters(), **self.cfg.train.rmsprop
            )
        elif self.cfg.train.optimizer.lower() == "rprop":
            optimizer = torch.optim.Rprop(
                self.model.parameters(), **self.cfg.train.rprop
            )
        elif self.cfg.train.optimizer.lower() == "sgd":
            optimizer = torch.optim.SGD(self.model.parameters(), **self.cfg.train.sgd)
        else:
            raise NotImplementedError(
                f"Optimizer {self.cfg.train.optimizer} not supported yet!"
            )
        return optimizer

    def __build_scheduler(self):
        r"""Build scheduler for optimizer."""
        # Make case-insensitive matching
        if self.cfg.train.scheduler.lower() == "lambdalr":
            scheduler = torch.optim.lr_scheduler.LambdaLR(
                self.optimizer, **self.cfg.train.lambdalr
            )
        elif self.cfg.train.scheduler.lower() == "multiplicativelr":
            scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
                self.optimizer, **self.cfg.train.multiplicativelr
            )
        elif self.cfg.train.scheduler.lower() == "steplr":
            scheduler = torch.optim.lr_scheduler.StepLR(
                self.optimizer, **self.cfg.train.steplr
            )
        elif self.cfg.train.scheduler.lower() == "multisteplr":
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer, **self.cfg.train.multisteplr
            )
        elif self.cfg.train.scheduler.lower() == "constantlr":
            scheduler = torch.optim.lr_scheduler.ConstantLR(
                self.optimizer, **self.cfg.train.constantlr
            )
        elif self.cfg.train.scheduler.lower() == "linearlr":
            scheduler = torch.optim.lr_scheduler.LinearLR(
                self.optimizer, **self.cfg.train.linearlr
            )
        elif self.cfg.train.scheduler.lower() == "exponentiallr":
            scheduler = torch.optim.lr_scheduler.ExponentialLR(
                self.optimizer, **self.cfg.train.exponentiallr
            )
        elif self.cfg.train.scheduler.lower() == "polynomiallr":
            scheduler = torch.optim.lr_scheduler.PolynomialLR(
                self.optimizer, **self.cfg.train.polynomiallr
            )
        elif self.cfg.train.scheduler.lower() == "cosineannealinglr":
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer, **self.cfg.train.cosineannealinglr
            )
        elif self.cfg.train.scheduler.lower() == "sequentiallr":
            scheduler = torch.optim.lr_scheduler.SequentialLR(
                self.optimizer, **self.cfg.train.sequentiallr
            )
        elif self.cfg.train.scheduler.lower() == "reducelronplateau":
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, **self.cfg.train.reducelronplateau
            )
        elif self.cfg.train.scheduler.lower() == "cycliclr":
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                self.optimizer, **self.cfg.train.cycliclr
            )
        elif self.cfg.train.scheduler.lower() == "onecyclelr":
            scheduler = torch.optim.lr_scheduler.OneCycleLR(
                self.optimizer, **self.cfg.train.onecyclelr
            )
        elif self.cfg.train.scheduler.lower() == "cosineannearingwarmrestarts":
            scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                self.optimizer, **self.cfg.train.cosineannearingwarmrestarts
            )
        elif self.cfg.train.scheduler.lower() == "noamlr":
            scheduler = NoamLR(self.optimizer, **self.cfg.train.lr_scheduler)
        else:
            raise NotImplementedError(
                f"Scheduler {self.cfg.train.scheduler} not supported yet!"
            )
        return scheduler

    def _init_accelerator(self):
        self.exp_dir = os.path.join(
            os.path.abspath(self.cfg.log_dir), self.args.exp_name
        )
        project_config = ProjectConfiguration(
            project_dir=self.exp_dir,
            logging_dir=os.path.join(self.exp_dir, "log"),
        )
        self.accelerator = accelerate.Accelerator(
            gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
            log_with=self.cfg.train.tracker,
            project_config=project_config,
        )
        if self.accelerator.is_main_process:
            os.makedirs(project_config.project_dir, exist_ok=True)
            os.makedirs(project_config.logging_dir, exist_ok=True)
        with self.accelerator.main_process_first():
            self.accelerator.init_trackers(self.args.exp_name)

    def __check_basic_configs(self):
        if self.cfg.train.gradient_accumulation_step <= 0:
            self.logger.fatal("Invalid gradient_accumulation_step value!")
            self.logger.error(
                f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
            )
            self.accelerator.end_training()
            raise ValueError(
                f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
            )
        # TODO: check other values

    @staticmethod
    def __count_parameters(model):
        model_param = 0.0
        if isinstance(model, dict):
            for key, value in model.items():
                model_param += sum(p.numel() for p in model[key].parameters())
        else:
            model_param = sum(p.numel() for p in model.parameters())
        return model_param

    def __dump_cfg(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        json5.dump(
            self.cfg,
            open(path, "w"),
            indent=4,
            sort_keys=True,
            ensure_ascii=False,
            quote_keys=True,
        )

    ### Private methods end ###
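Again for orientation (not part of the diff): the accelerate-based BaseTrainer above drives everything from train_loop(), so a task trainer only needs the five abstract hooks. Below is a minimal, assumption-heavy sketch; SketchDataset, SketchCollator, and the toy linear model with "input"/"target" batch keys are placeholders, not Amphion code.

import torch.nn as nn


class SketchTrainer(BaseTrainer):  # BaseTrainer as defined above
    def _build_dataset(self):
        # Must return the (Dataset, Collator) classes; placeholders here.
        return SketchDataset, SketchCollator

    @staticmethod
    def _build_criterion():
        return nn.L1Loss()

    def _build_model(self):
        return nn.Linear(80, 80)  # placeholder acoustic model

    def _forward_step(self, batch):
        # One scalar loss per batch; backward, logging and checkpointing
        # are handled by _train_epoch()/train_loop().
        pred = self.model(batch["input"])
        return self.criterion(pred, batch["target"])

    def _save_auxiliary_states(self):
        pass


# Typical entry point (see "THIS IS MAIN ENTRY" above):
#     trainer = SketchTrainer(args, cfg)
#     trainer.train_loop()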

models/svc/__init__.py
ADDED
File without changes

models/svc/base/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .svc_inference import SVCInference
from .svc_trainer import SVCTrainer

models/svc/base/svc_dataset.py
ADDED
@@ -0,0 +1,425 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import torch
from torch.nn.utils.rnn import pad_sequence
import json
import os
import numpy as np
from utils.data_utils import *
from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema
from processors.content_extractor import (
    ContentvecExtractor,
    WhisperExtractor,
    WenetExtractor,
)
from models.base.base_dataset import (
    BaseCollator,
    BaseDataset,
)
from models.base.new_dataset import BaseTestDataset

EPS = 1.0e-12


class SVCDataset(BaseDataset):
    def __init__(self, cfg, dataset, is_valid=False):
        BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid)

        cfg = self.cfg

        if cfg.model.condition_encoder.use_whisper:
            self.whisper_aligner = WhisperExtractor(self.cfg)
            self.utt2whisper_path = load_content_feature_path(
                self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
            )

        if cfg.model.condition_encoder.use_contentvec:
            self.contentvec_aligner = ContentvecExtractor(self.cfg)
            self.utt2contentVec_path = load_content_feature_path(
                self.metadata,
                cfg.preprocess.processed_dir,
                cfg.preprocess.contentvec_dir,
            )

        if cfg.model.condition_encoder.use_mert:
            self.utt2mert_path = load_content_feature_path(
                self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
            )
        if cfg.model.condition_encoder.use_wenet:
            self.wenet_aligner = WenetExtractor(self.cfg)
            self.utt2wenet_path = load_content_feature_path(
                self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
            )

    def __getitem__(self, index):
        single_feature = BaseDataset.__getitem__(self, index)

        utt_info = self.metadata[index]
        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        if self.cfg.model.condition_encoder.use_whisper:
            assert "target_len" in single_feature.keys()
            aligned_whisper_feat = self.whisper_aligner.offline_align(
                np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
            )
            single_feature["whisper_feat"] = aligned_whisper_feat

        if self.cfg.model.condition_encoder.use_contentvec:
            assert "target_len" in single_feature.keys()
            aligned_contentvec = self.contentvec_aligner.offline_align(
                np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
            )
            single_feature["contentvec_feat"] = aligned_contentvec

        if self.cfg.model.condition_encoder.use_mert:
            assert "target_len" in single_feature.keys()
            aligned_mert_feat = align_content_feature_length(
                np.load(self.utt2mert_path[utt]),
                single_feature["target_len"],
                source_hop=self.cfg.preprocess.mert_hop_size,
            )
            single_feature["mert_feat"] = aligned_mert_feat

        if self.cfg.model.condition_encoder.use_wenet:
            assert "target_len" in single_feature.keys()
            aligned_wenet_feat = self.wenet_aligner.offline_align(
                np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
            )
            single_feature["wenet_feat"] = aligned_wenet_feat

        # print(single_feature.keys())
        # for k, v in single_feature.items():
        #     if type(v) in [torch.Tensor, np.ndarray]:
        #         print(k, v.shape)
        #     else:
        #         print(k, v)
        # exit()

        return self.clip_if_too_long(single_feature)

    def __len__(self):
        return len(self.metadata)

    def random_select(self, feature_seq_len, max_seq_len, ending_ts=2812):
        """
        ending_ts: to avoid invalid whisper features for over 30s audios
            2812 = 30 * 24000 // 256
        """
        ts = max(feature_seq_len - max_seq_len, 0)
        ts = min(ts, ending_ts - max_seq_len)

        start = random.randint(0, ts)
        end = start + max_seq_len
        return start, end

    def clip_if_too_long(self, sample, max_seq_len=512):
        """
        sample :
            {
                'spk_id': (1,),
                'target_len': int
                'mel': (seq_len, dim),
                'frame_pitch': (seq_len,)
                'frame_energy': (seq_len,)
                'content_vector_feat': (seq_len, dim)
            }
        """
        if sample["target_len"] <= max_seq_len:
            return sample

        start, end = self.random_select(sample["target_len"], max_seq_len)
        sample["target_len"] = end - start

        for k in sample.keys():
            if k not in ["spk_id", "target_len"]:
                sample[k] = sample[k][start:end]

        return sample


class SVCCollator(BaseCollator):
    """Zero-pads model inputs and targets based on number of frames per step"""

    def __init__(self, cfg):
        BaseCollator.__init__(self, cfg)

    def __call__(self, batch):
        parsed_batch_features = BaseCollator.__call__(self, batch)
        return parsed_batch_features


class SVCTestDataset(BaseTestDataset):
    def __init__(self, args, cfg, infer_type):
        BaseTestDataset.__init__(self, args, cfg, infer_type)
        self.metadata = self.get_metadata()

        target_singer = args.target_singer
        self.cfg = cfg
        self.trans_key = args.trans_key
        assert type(target_singer) == str

        self.target_singer = target_singer.split("_")[-1]
        self.target_dataset = target_singer.replace(
            "_{}".format(self.target_singer), ""
        )

        self.target_mel_extrema = load_mel_extrema(cfg.preprocess, self.target_dataset)
        self.target_mel_extrema = torch.as_tensor(
            self.target_mel_extrema[0]
        ), torch.as_tensor(self.target_mel_extrema[1])

        ######### Load source acoustic features #########
        if cfg.preprocess.use_spkid:
            spk2id_path = os.path.join(args.acoustics_dir, cfg.preprocess.spk2id)
            # utt2sp_path = os.path.join(self.data_root, cfg.preprocess.utt2spk)

            with open(spk2id_path, "r") as f:
                self.spk2id = json.load(f)
            # print("self.spk2id", self.spk2id)

        if cfg.preprocess.use_uv:
            self.utt2uv_path = {
                f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
                    cfg.preprocess.processed_dir,
                    utt_info["Dataset"],
                    cfg.preprocess.uv_dir,
                    utt_info["Uid"] + ".npy",
                )
                for utt_info in self.metadata
            }

        if cfg.preprocess.use_frame_pitch:
            self.utt2frame_pitch_path = {
                f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
                    cfg.preprocess.processed_dir,
                    utt_info["Dataset"],
                    cfg.preprocess.pitch_dir,
                    utt_info["Uid"] + ".npy",
                )
                for utt_info in self.metadata
            }

            # Target F0 median
            target_f0_statistics_path = os.path.join(
                cfg.preprocess.processed_dir,
                self.target_dataset,
                cfg.preprocess.pitch_dir,
                "statistics.json",
            )
            self.target_pitch_median = json.load(open(target_f0_statistics_path, "r"))[
                f"{self.target_dataset}_{self.target_singer}"
            ]["voiced_positions"]["median"]

            # Source F0 median (if infer from file)
            if infer_type == "from_file":
                source_audio_name = cfg.inference.source_audio_name
                source_f0_statistics_path = os.path.join(
                    cfg.preprocess.processed_dir,
                    source_audio_name,
                    cfg.preprocess.pitch_dir,
                    "statistics.json",
                )
                self.source_pitch_median = json.load(
                    open(source_f0_statistics_path, "r")
                )[f"{source_audio_name}_{source_audio_name}"]["voiced_positions"][
                    "median"
                ]
            else:
                self.source_pitch_median = None

        if cfg.preprocess.use_frame_energy:
            self.utt2frame_energy_path = {
                f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
                    cfg.preprocess.processed_dir,
                    utt_info["Dataset"],
                    cfg.preprocess.energy_dir,
                    utt_info["Uid"] + ".npy",
                )
                for utt_info in self.metadata
            }

        if cfg.preprocess.use_mel:
            self.utt2mel_path = {
                f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
                    cfg.preprocess.processed_dir,
                    utt_info["Dataset"],
                    cfg.preprocess.mel_dir,
                    utt_info["Uid"] + ".npy",
                )
                for utt_info in self.metadata
            }

        ######### Load source content features' path #########
        if cfg.model.condition_encoder.use_whisper:
            self.whisper_aligner = WhisperExtractor(cfg)
            self.utt2whisper_path = load_content_feature_path(
                self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
            )

        if cfg.model.condition_encoder.use_contentvec:
            self.contentvec_aligner = ContentvecExtractor(cfg)
            self.utt2contentVec_path = load_content_feature_path(
                self.metadata,
                cfg.preprocess.processed_dir,
                cfg.preprocess.contentvec_dir,
            )

        if cfg.model.condition_encoder.use_mert:
            self.utt2mert_path = load_content_feature_path(
                self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
            )
        if cfg.model.condition_encoder.use_wenet:
            self.wenet_aligner = WenetExtractor(cfg)
            self.utt2wenet_path = load_content_feature_path(
                self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
            )

    def __getitem__(self, index):
        single_feature = {}

        utt_info = self.metadata[index]
        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        source_dataset = self.metadata[index]["Dataset"]

        if self.cfg.preprocess.use_spkid:
            single_feature["spk_id"] = np.array(
                [self.spk2id[f"{self.target_dataset}_{self.target_singer}"]],
                dtype=np.int32,
            )

        ######### Get Acoustic Features Item #########
        if self.cfg.preprocess.use_mel:
            mel = np.load(self.utt2mel_path[utt])
            assert mel.shape[0] == self.cfg.preprocess.n_mel  # [n_mels, T]
            if self.cfg.preprocess.use_min_max_norm_mel:
                # mel norm
                mel = cal_normalized_mel(mel, source_dataset, self.cfg.preprocess)

            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = mel.shape[1]
            single_feature["mel"] = mel.T  # [T, n_mels]

        if self.cfg.preprocess.use_frame_pitch:
            frame_pitch_path = self.utt2frame_pitch_path[utt]
            frame_pitch = np.load(frame_pitch_path)

            if self.trans_key:
                try:
                    self.trans_key = int(self.trans_key)
                except:
                    pass
                if type(self.trans_key) == int:
                    frame_pitch = transpose_key(frame_pitch, self.trans_key)
                elif self.trans_key:
                    assert self.target_singer

                    frame_pitch = pitch_shift_to_target(
                        frame_pitch, self.target_pitch_median, self.source_pitch_median
                    )

            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_pitch)
            aligned_frame_pitch = align_length(
                frame_pitch, single_feature["target_len"]
            )
            single_feature["frame_pitch"] = aligned_frame_pitch

            if self.cfg.preprocess.use_uv:
                frame_uv_path = self.utt2uv_path[utt]
                frame_uv = np.load(frame_uv_path)
                aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
                aligned_frame_uv = [
                    0 if frame_uv else 1 for frame_uv in aligned_frame_uv
                ]
                aligned_frame_uv = np.array(aligned_frame_uv)
                single_feature["frame_uv"] = aligned_frame_uv

        if self.cfg.preprocess.use_frame_energy:
            frame_energy_path = self.utt2frame_energy_path[utt]
            frame_energy = np.load(frame_energy_path)
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_energy)
            aligned_frame_energy = align_length(
                frame_energy, single_feature["target_len"]
            )
            single_feature["frame_energy"] = aligned_frame_energy

        ######### Get Content Features Item #########
        if self.cfg.model.condition_encoder.use_whisper:
            assert "target_len" in single_feature.keys()
            aligned_whisper_feat = self.whisper_aligner.offline_align(
                np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
            )
            single_feature["whisper_feat"] = aligned_whisper_feat

        if self.cfg.model.condition_encoder.use_contentvec:
            assert "target_len" in single_feature.keys()
            aligned_contentvec = self.contentvec_aligner.offline_align(
                np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
            )
            single_feature["contentvec_feat"] = aligned_contentvec

        if self.cfg.model.condition_encoder.use_mert:
            assert "target_len" in single_feature.keys()
            aligned_mert_feat = align_content_feature_length(
                np.load(self.utt2mert_path[utt]),
                single_feature["target_len"],
                source_hop=self.cfg.preprocess.mert_hop_size,
            )
            single_feature["mert_feat"] = aligned_mert_feat

        if self.cfg.model.condition_encoder.use_wenet:
            assert "target_len" in single_feature.keys()
            aligned_wenet_feat = self.wenet_aligner.offline_align(
                np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
            )
            single_feature["wenet_feat"] = aligned_wenet_feat

        return single_feature

    def __len__(self):
        return len(self.metadata)


class SVCTestCollator:
    """Zero-pads model inputs and targets based on number of frames per step"""

    def __init__(self, cfg):
        self.cfg = cfg

    def __call__(self, batch):
        packed_batch_features = dict()

        # mel: [b, T, n_mels]
        # frame_pitch, frame_energy: [1, T]
        # target_len: [1]
        # spk_id: [b, 1]
        # mask: [b, T, 1]

        for key in batch[0].keys():
            if key == "target_len":
                packed_batch_features["target_len"] = torch.LongTensor(
                    [b["target_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
                ]
                packed_batch_features["mask"] = pad_sequence(
                    masks, batch_first=True, padding_value=0
                )
            else:
                values = [torch.from_numpy(b[key]) for b in batch]
                packed_batch_features[key] = pad_sequence(
                    values, batch_first=True, padding_value=0
                )

        return packed_batch_features
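
For orientation, the sketch below shows one way the classes above could be wired into a PyTorch DataLoader for training. It is an illustrative example rather than code from this commit; the dataset name, batch size, and worker count are placeholder assumptions.

# Hypothetical usage sketch (not part of this commit): building a training
# DataLoader from SVCDataset / SVCCollator. `cfg` is assumed to be a loaded
# Amphion experiment config, and "vocalist_l1" a preprocessed dataset name.
from torch.utils.data import DataLoader

from models.svc.base.svc_dataset import SVCCollator, SVCDataset


def build_svc_loader(cfg, dataset_name="vocalist_l1", batch_size=16):
    train_set = SVCDataset(cfg, dataset_name, is_valid=False)
    collator = SVCCollator(cfg)
    return DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collator,  # zero-pads variable-length features per batch
        num_workers=4,
    )

Here `SVCCollator` simply delegates to `BaseCollator`, so every variable-length feature in a batch comes out zero-padded to the longest utterance, matching the mask convention used by `SVCTestCollator`.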
models/svc/base/svc_inference.py ADDED
@@ -0,0 +1,15 @@

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from models.base.new_inference import BaseInference
from models.svc.base.svc_dataset import SVCTestCollator, SVCTestDataset


class SVCInference(BaseInference):
    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        BaseInference.__init__(self, args, cfg, infer_type)

    def _build_test_dataset(self):
        return SVCTestDataset, SVCTestCollator
models/svc/base/svc_trainer.py ADDED
@@ -0,0 +1,111 @@

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os

import torch
import torch.nn as nn

from models.base.new_trainer import BaseTrainer
from models.svc.base.svc_dataset import SVCCollator, SVCDataset


class SVCTrainer(BaseTrainer):
    r"""The base trainer for all SVC models. It inherits from BaseTrainer and implements
    ``build_criterion``, ``_build_dataset`` and ``_build_singer_lut`` methods. You can inherit from this
    class, and implement ``_build_model``, ``_forward_step``.
    """

    def __init__(self, args=None, cfg=None):
        self.args = args
        self.cfg = cfg

        self._init_accelerator()

        # Only for SVC tasks
        with self.accelerator.main_process_first():
            self.singers = self._build_singer_lut()

        # Super init
        BaseTrainer.__init__(self, args, cfg)

        # Only for SVC tasks
        self.task_type = "SVC"
        self.logger.info("Task type: {}".format(self.task_type))

    ### Following are methods only for SVC tasks ###
    # TODO: LEGACY CODE, NEED TO BE REFACTORED
    def _build_dataset(self):
        return SVCDataset, SVCCollator

    @staticmethod
    def _build_criterion():
        criterion = nn.MSELoss(reduction="none")
        return criterion

    @staticmethod
    def _compute_loss(criterion, y_pred, y_gt, loss_mask):
        """
        Args:
            criterion: MSELoss(reduction='none')
            y_pred, y_gt: (bs, seq_len, D)
            loss_mask: (bs, seq_len, 1)
        Returns:
            loss: Tensor of shape []
        """

        # (bs, seq_len, D)
        loss = criterion(y_pred, y_gt)
        # expand loss_mask to (bs, seq_len, D)
        loss_mask = loss_mask.repeat(1, 1, loss.shape[-1])

        loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask)
        return loss

    def _save_auxiliary_states(self):
        """
        To save the singer's look-up table in the checkpoint saving path
        """
        with open(
            os.path.join(self.tmp_checkpoint_save_path, self.cfg.preprocess.spk2id), "w"
        ) as f:
            json.dump(self.singers, f, indent=4, ensure_ascii=False)

    def _build_singer_lut(self):
        resumed_singer_path = None
        if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "":
            resumed_singer_path = os.path.join(
                self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id
            )
        if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
            resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)

        if resumed_singer_path:
            with open(resumed_singer_path, "r") as f:
                singers = json.load(f)
        else:
            singers = dict()

        for dataset in self.cfg.dataset:
            singer_lut_path = os.path.join(
                self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
            )
            with open(singer_lut_path, "r") as singer_lut_path:
                singer_lut = json.load(singer_lut_path)
            for singer in singer_lut.keys():
                if singer not in singers:
                    singers[singer] = len(singers)

        with open(
            os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
        ) as singer_file:
            json.dump(singers, singer_file, indent=4, ensure_ascii=False)
        print(
            "singers have been dumped to {}".format(
                os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
            )
        )
        return singers
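
As the class docstring notes, a concrete SVC model only needs to provide `_build_model` and `_forward_step`. The sketch below illustrates the minimal shape of such a subclass; the placeholder architecture, the batch keys, the per-step criterion construction, and the assumption that the base trainer exposes the built model as `self.model` are illustrative and not part of this commit.

# Hypothetical subclass sketch (not part of this commit).
import torch.nn as nn

from models.svc.base.svc_trainer import SVCTrainer


class LinearSVCTrainer(SVCTrainer):
    def _build_model(self):
        # Placeholder architecture: any nn.Module mapping condition
        # features to mel frames would go here.
        return nn.Linear(256, 100)

    def _forward_step(self, batch):
        # Batch keys follow the collator output above; "mask" flags the
        # non-padded frames of each utterance.
        y_pred = self.model(batch["contentvec_feat"])  # assumed set by the base trainer
        y_gt = batch["mel"]
        criterion = self._build_criterion()  # in practice built once, not per step
        return self._compute_loss(criterion, y_pred, y_gt, loss_mask=batch["mask"])

Because `_compute_loss` averages the element-wise MSE only over masked (real) frames, padded regions contribute nothing to the gradient.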
models/svc/comosvc/__init__.py ADDED
@@ -0,0 +1,4 @@

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.