Spaces:
Running
Running
mrfakename
committed on
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "f5-tts"
|
7 |
-
version = "0.
|
8 |
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
|
9 |
readme = "README.md"
|
10 |
license = {text = "MIT License"}
|
@@ -21,6 +21,7 @@ dependencies = [
|
|
21 |
"datasets",
|
22 |
"ema_pytorch>=0.5.2",
|
23 |
"gradio>=3.45.2",
|
|
|
24 |
"jieba",
|
25 |
"librosa",
|
26 |
"matplotlib",
|
@@ -39,7 +40,6 @@ dependencies = [
|
|
39 |
"vocos",
|
40 |
"wandb",
|
41 |
"x_transformers>=1.31.14",
|
42 |
-
"hydra-core>=1.3.0",
|
43 |
]
|
44 |
|
45 |
[project.optional-dependencies]
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "f5-tts"
|
7 |
+
version = "0.2.0"
|
8 |
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
|
9 |
readme = "README.md"
|
10 |
license = {text = "MIT License"}
|
|
|
21 |
"datasets",
|
22 |
"ema_pytorch>=0.5.2",
|
23 |
"gradio>=3.45.2",
|
24 |
+
"hydra-core>=1.3.0",
|
25 |
"jieba",
|
26 |
"librosa",
|
27 |
"matplotlib",
|
|
|
40 |
"vocos",
|
41 |
"wandb",
|
42 |
"x_transformers>=1.31.14",
|
|
|
43 |
]
|
44 |
|
45 |
[project.optional-dependencies]
|
src/f5_tts/configs/E2TTS_Base_train.yaml
CHANGED
@@ -3,41 +3,41 @@ hydra:
|
|
3 |
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
4 |
|
5 |
datasets:
|
6 |
-
name: Emilia_ZH_EN
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
-
batch_size_type: frame
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
-
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
-
epochs: 15
|
14 |
-
learning_rate: 7.5e-5
|
15 |
num_warmup_updates: 20000 # warmup steps
|
16 |
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
17 |
-
max_grad_norm: 1.0
|
18 |
-
bnb_optimizer: False
|
19 |
|
20 |
model:
|
21 |
-
name: E2TTS_Base
|
22 |
-
tokenizer: pinyin
|
23 |
tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
|
24 |
arch:
|
25 |
-
dim: 1024
|
26 |
-
depth: 24
|
27 |
-
heads: 16
|
28 |
-
ff_mult: 4
|
29 |
mel_spec:
|
30 |
-
target_sample_rate: 24000
|
31 |
-
n_mel_channels: 100
|
32 |
-
hop_length: 256
|
33 |
-
win_length: 1024
|
34 |
-
n_fft: 1024
|
35 |
mel_spec_type: vocos # 'vocos' or 'bigvgan'
|
36 |
-
is_local_vocoder: False
|
37 |
-
local_vocoder_path: None
|
38 |
|
39 |
ckpts:
|
40 |
-
logger: wandb
|
41 |
-
save_per_updates: 50000
|
42 |
-
last_per_steps: 5000
|
43 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
|
|
3 |
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
4 |
|
5 |
datasets:
|
6 |
+
name: Emilia_ZH_EN # dataset name
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
+
batch_size_type: frame # "frame" or "sample"
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
+
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
+
epochs: 15
|
14 |
+
learning_rate: 7.5e-5
|
15 |
num_warmup_updates: 20000 # warmup steps
|
16 |
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
17 |
+
max_grad_norm: 1.0 # gradient clipping
|
18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
19 |
|
20 |
model:
|
21 |
+
name: E2TTS_Base
|
22 |
+
tokenizer: pinyin
|
23 |
tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
|
24 |
arch:
|
25 |
+
dim: 1024
|
26 |
+
depth: 24
|
27 |
+
heads: 16
|
28 |
+
ff_mult: 4
|
29 |
mel_spec:
|
30 |
+
target_sample_rate: 24000
|
31 |
+
n_mel_channels: 100
|
32 |
+
hop_length: 256
|
33 |
+
win_length: 1024
|
34 |
+
n_fft: 1024
|
35 |
mel_spec_type: vocos # 'vocos' or 'bigvgan'
|
36 |
+
is_local_vocoder: False # use local offline vocoder ckpt or not
|
37 |
+
local_vocoder_path: None # path to local vocoder
|
38 |
|
39 |
ckpts:
|
40 |
+
logger: wandb # wandb | tensorboard | None
|
41 |
+
save_per_updates: 50000 # save checkpoint per steps
|
42 |
+
last_per_steps: 5000 # save last checkpoint per steps
|
43 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
src/f5_tts/configs/E2TTS_Small_train.yaml
CHANGED
@@ -5,9 +5,9 @@ hydra:
|
|
5 |
datasets:
|
6 |
name: Emilia_ZH_EN
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
-
batch_size_type: frame
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
-
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
epochs: 15
|
@@ -37,7 +37,7 @@ model:
|
|
37 |
local_vocoder_path: None
|
38 |
|
39 |
ckpts:
|
40 |
-
logger: wandb
|
41 |
-
save_per_updates: 50000
|
42 |
-
last_per_steps: 5000
|
43 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
|
|
5 |
datasets:
|
6 |
name: Emilia_ZH_EN
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
+
batch_size_type: frame # "frame" or "sample"
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
+
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
epochs: 15
|
|
|
37 |
local_vocoder_path: None
|
38 |
|
39 |
ckpts:
|
40 |
+
logger: wandb # wandb | tensorboard | None
|
41 |
+
save_per_updates: 50000 # save checkpoint per steps
|
42 |
+
last_per_steps: 5000 # save last checkpoint per steps
|
43 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
src/f5_tts/configs/F5TTS_Base_train.yaml
CHANGED
@@ -3,43 +3,43 @@ hydra:
|
|
3 |
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
4 |
|
5 |
datasets:
|
6 |
-
name: Emilia_ZH_EN
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
-
batch_size_type: frame
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
-
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
-
epochs: 15
|
14 |
-
learning_rate: 7.5e-5
|
15 |
num_warmup_updates: 20000 # warmup steps
|
16 |
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
17 |
-
max_grad_norm: 1.0
|
18 |
-
bnb_optimizer: False
|
19 |
|
20 |
model:
|
21 |
-
name: F5TTS_Base
|
22 |
-
tokenizer: pinyin
|
23 |
tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
|
24 |
arch:
|
25 |
-
dim: 1024
|
26 |
-
depth: 22
|
27 |
-
heads: 16
|
28 |
-
ff_mult: 2
|
29 |
-
text_dim: 512
|
30 |
-
conv_layers: 4
|
31 |
mel_spec:
|
32 |
-
target_sample_rate: 24000
|
33 |
-
n_mel_channels: 100
|
34 |
-
hop_length: 256
|
35 |
-
win_length: 1024
|
36 |
-
n_fft: 1024
|
37 |
mel_spec_type: vocos # 'vocos' or 'bigvgan'
|
38 |
-
is_local_vocoder: False
|
39 |
-
local_vocoder_path: None
|
40 |
|
41 |
ckpts:
|
42 |
-
logger: wandb
|
43 |
-
save_per_updates: 50000
|
44 |
-
last_per_steps: 5000
|
45 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
|
|
3 |
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
4 |
|
5 |
datasets:
|
6 |
+
name: Emilia_ZH_EN # dataset name
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
+
batch_size_type: frame # "frame" or "sample"
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
+
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
+
epochs: 15
|
14 |
+
learning_rate: 7.5e-5
|
15 |
num_warmup_updates: 20000 # warmup steps
|
16 |
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
17 |
+
max_grad_norm: 1.0 # gradient clipping
|
18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
19 |
|
20 |
model:
|
21 |
+
name: F5TTS_Base # model name
|
22 |
+
tokenizer: pinyin # tokenizer type
|
23 |
tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
|
24 |
arch:
|
25 |
+
dim: 1024
|
26 |
+
depth: 22
|
27 |
+
heads: 16
|
28 |
+
ff_mult: 2
|
29 |
+
text_dim: 512
|
30 |
+
conv_layers: 4
|
31 |
mel_spec:
|
32 |
+
target_sample_rate: 24000
|
33 |
+
n_mel_channels: 100
|
34 |
+
hop_length: 256
|
35 |
+
win_length: 1024
|
36 |
+
n_fft: 1024
|
37 |
mel_spec_type: vocos # 'vocos' or 'bigvgan'
|
38 |
+
is_local_vocoder: False # use local offline vocoder ckpt or not
|
39 |
+
local_vocoder_path: None # local vocoder path
|
40 |
|
41 |
ckpts:
|
42 |
+
logger: wandb # wandb | tensorboard | None
|
43 |
+
save_per_updates: 50000 # save checkpoint per steps
|
44 |
+
last_per_steps: 5000 # save last checkpoint per steps
|
45 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
src/f5_tts/configs/F5TTS_Small_train.yaml
CHANGED
@@ -5,17 +5,17 @@ hydra:
|
|
5 |
datasets:
|
6 |
name: Emilia_ZH_EN
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
-
batch_size_type: frame
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
-
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
epochs: 15
|
14 |
learning_rate: 7.5e-5
|
15 |
num_warmup_updates: 20000 # warmup steps
|
16 |
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
17 |
-
max_grad_norm: 1.0
|
18 |
-
bnb_optimizer: False
|
19 |
|
20 |
model:
|
21 |
name: F5TTS_Small
|
@@ -39,7 +39,7 @@ model:
|
|
39 |
local_vocoder_path: None
|
40 |
|
41 |
ckpts:
|
42 |
-
logger: wandb
|
43 |
-
save_per_updates: 50000
|
44 |
-
last_per_steps: 5000
|
45 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
|
|
5 |
datasets:
|
6 |
name: Emilia_ZH_EN
|
7 |
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
8 |
+
batch_size_type: frame # "frame" or "sample"
|
9 |
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
10 |
+
num_workers: 16
|
11 |
|
12 |
optim:
|
13 |
epochs: 15
|
14 |
learning_rate: 7.5e-5
|
15 |
num_warmup_updates: 20000 # warmup steps
|
16 |
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
17 |
+
max_grad_norm: 1.0 # gradient clipping
|
18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
19 |
|
20 |
model:
|
21 |
name: F5TTS_Small
|
|
|
39 |
local_vocoder_path: None
|
40 |
|
41 |
ckpts:
|
42 |
+
logger: wandb # wandb | tensorboard | None
|
43 |
+
save_per_updates: 50000 # save checkpoint per steps
|
44 |
+
last_per_steps: 5000 # save last checkpoint per steps
|
45 |
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
src/f5_tts/train/README.md
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
|
3 |
## Prepare Dataset
|
4 |
|
5 |
-
Example data processing scripts
|
6 |
|
7 |
-
### 1.
|
8 |
Download corresponding dataset first, and fill in the path in scripts.
|
9 |
|
10 |
```bash
|
@@ -38,7 +38,9 @@ Once your datasets are prepared, you can start the training process.
|
|
38 |
# setup accelerate config, e.g. use multi-gpu ddp, fp16
|
39 |
# will be to: ~/.cache/huggingface/accelerate/default_config.yaml
|
40 |
accelerate config
|
41 |
-
|
|
|
|
|
42 |
```
|
43 |
|
44 |
### 2. Finetuning practice
|
|
|
2 |
|
3 |
## Prepare Dataset
|
4 |
|
5 |
+
Example data processing scripts, and you may tailor your own one along with a Dataset class in `src/f5_tts/model/dataset.py`.
|
6 |
|
7 |
+
### 1. Some specific Datasets preparing scripts
|
8 |
Download corresponding dataset first, and fill in the path in scripts.
|
9 |
|
10 |
```bash
|
|
|
38 |
# setup accelerate config, e.g. use multi-gpu ddp, fp16
|
39 |
# will be to: ~/.cache/huggingface/accelerate/default_config.yaml
|
40 |
accelerate config
|
41 |
+
|
42 |
+
# .yaml files are under src/f5_tts/configs directory
|
43 |
+
accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml
|
44 |
```
|
45 |
|
46 |
### 2. Finetuning practice
|
src/f5_tts/train/train.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
# training script.
|
|
|
2 |
import os
|
3 |
from importlib.resources import files
|
4 |
|
@@ -8,7 +9,7 @@ from f5_tts.model import CFM, DiT, Trainer, UNetT
|
|
8 |
from f5_tts.model.dataset import load_dataset
|
9 |
from f5_tts.model.utils import get_tokenizer
|
10 |
|
11 |
-
os.chdir(str(files("f5_tts").joinpath("../..")))
|
12 |
|
13 |
|
14 |
@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
|
|
|
1 |
# training script.
|
2 |
+
|
3 |
import os
|
4 |
from importlib.resources import files
|
5 |
|
|
|
9 |
from f5_tts.model.dataset import load_dataset
|
10 |
from f5_tts.model.utils import get_tokenizer
|
11 |
|
12 |
+
os.chdir(str(files("f5_tts").joinpath("../.."))) # change working directory to root of project (local editable)
|
13 |
|
14 |
|
15 |
@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
|