Sync from GitHub repo
Browse files. This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo rather than to the Space.
- app.py +78 -22
- pyproject.toml +1 -1
- src/f5_tts/infer/README.md +2 -0
- src/f5_tts/infer/SHARED.md +34 -0
app.py
CHANGED
|
@@ -38,22 +38,40 @@ from f5_tts.infer.utils_infer import (
|
|
| 38 |
save_spectrogram,
|
| 39 |
)
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
# load models
|
| 45 |
-
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
| 46 |
-
F5TTS_ema_model = load_model(
|
| 47 |
-
DiT, F5TTS_model_cfg, str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
|
| 48 |
-
)
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
DEFAULT_TTS_MODEL = "F5-TTS"
|
| 56 |
-
tts_model_choice = DEFAULT_TTS_MODEL
|
| 57 |
chat_model_state = None
|
| 58 |
chat_tokenizer_state = None
|
| 59 |
|
|
@@ -90,7 +108,16 @@ def infer(
|
|
| 90 |
if model == "F5-TTS":
|
| 91 |
ema_model = F5TTS_ema_model
|
| 92 |
elif model == "E2-TTS":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
ema_model = E2TTS_ema_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
| 96 |
ref_audio,
|
|
@@ -712,21 +739,50 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
|
|
| 712 |
"""
|
| 713 |
)
|
| 714 |
|
| 715 |
-
def switch_tts_model(new_choice):
|
| 716 |
global tts_model_choice
|
| 717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
|
| 719 |
-
if not USING_SPACES:
|
| 720 |
-
choose_tts_model = gr.Radio(
|
| 721 |
-
choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
|
| 722 |
-
)
|
| 723 |
-
else:
|
| 724 |
-
choose_tts_model = gr.Radio(
|
| 725 |
-
choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
|
| 726 |
-
)
|
| 727 |
choose_tts_model.change(
|
| 728 |
switch_tts_model,
|
| 729 |
-
inputs=choose_tts_model,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
)
|
| 731 |
|
| 732 |
gr.TabbedInterface(
|
|
|
|
| 38 |
save_spectrogram,
|
| 39 |
)
|
| 40 |
|
| 41 |
+
|
| 42 |
+
DEFAULT_TTS_MODEL = "F5-TTS"
|
| 43 |
+
tts_model_choice = DEFAULT_TTS_MODEL
|
| 44 |
|
| 45 |
|
| 46 |
# load models
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
vocoder = load_vocoder()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_f5tts(ckpt_path=str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))):
|
| 52 |
+
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
| 53 |
+
return load_model(DiT, F5TTS_model_cfg, ckpt_path)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def load_e2tts(ckpt_path=str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))):
|
| 57 |
+
E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
|
| 58 |
+
return load_model(UNetT, E2TTS_model_cfg, ckpt_path)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
|
| 62 |
+
ckpt_path, vocab_path = ckpt_path.strip(), vocab_path.strip()
|
| 63 |
+
if ckpt_path.startswith("hf://"):
|
| 64 |
+
ckpt_path = str(cached_path(ckpt_path))
|
| 65 |
+
if vocab_path.startswith("hf://"):
|
| 66 |
+
vocab_path = str(cached_path(vocab_path))
|
| 67 |
+
if model_cfg is None:
|
| 68 |
+
model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
| 69 |
+
return load_model(DiT, model_cfg, ckpt_path, vocab_file=vocab_path)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
F5TTS_ema_model = load_f5tts()
|
| 73 |
+
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
|
| 74 |
|
|
|
|
|
|
|
| 75 |
chat_model_state = None
|
| 76 |
chat_tokenizer_state = None
|
| 77 |
|
|
|
|
| 108 |
if model == "F5-TTS":
|
| 109 |
ema_model = F5TTS_ema_model
|
| 110 |
elif model == "E2-TTS":
|
| 111 |
+
global E2TTS_ema_model
|
| 112 |
+
if E2TTS_ema_model is None:
|
| 113 |
+
show_info("Loading E2-TTS model...")
|
| 114 |
+
E2TTS_ema_model = load_e2tts()
|
| 115 |
ema_model = E2TTS_ema_model
|
| 116 |
+
elif isinstance(model, list) and model[0] == "Custom":
|
| 117 |
+
assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
|
| 118 |
+
show_info("Loading Custom TTS model...")
|
| 119 |
+
custom_ema_model = load_custom(model[1], vocab_path=model[2])
|
| 120 |
+
ema_model = custom_ema_model
|
| 121 |
|
| 122 |
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
| 123 |
ref_audio,
|
|
|
|
| 739 |
"""
|
| 740 |
)
|
| 741 |
|
| 742 |
+
def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
|
| 743 |
global tts_model_choice
|
| 744 |
+
if new_choice == "Custom":
|
| 745 |
+
tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
|
| 746 |
+
return gr.update(visible=True)
|
| 747 |
+
else:
|
| 748 |
+
tts_model_choice = new_choice
|
| 749 |
+
return gr.update(visible=False)
|
| 750 |
+
|
| 751 |
+
with gr.Row():
|
| 752 |
+
if not USING_SPACES:
|
| 753 |
+
choose_tts_model = gr.Radio(
|
| 754 |
+
choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
|
| 755 |
+
)
|
| 756 |
+
else:
|
| 757 |
+
choose_tts_model = gr.Radio(
|
| 758 |
+
choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
|
| 759 |
+
)
|
| 760 |
+
with gr.Column(visible=False) as choose_custom_tts_model:
|
| 761 |
+
custom_ckpt_path = gr.Textbox(
|
| 762 |
+
placeholder="MODEL_CKPT: local_path | hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
|
| 763 |
+
show_label=False,
|
| 764 |
+
min_width=200,
|
| 765 |
+
)
|
| 766 |
+
custom_vocab_path = gr.Textbox(
|
| 767 |
+
placeholder="VOCAB_FILE: local_path | hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt | leave blank to use default",
|
| 768 |
+
show_label=False,
|
| 769 |
+
min_width=200,
|
| 770 |
+
)
|
| 771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
choose_tts_model.change(
|
| 773 |
switch_tts_model,
|
| 774 |
+
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 775 |
+
outputs=[choose_custom_tts_model],
|
| 776 |
+
)
|
| 777 |
+
custom_ckpt_path.change(
|
| 778 |
+
switch_tts_model,
|
| 779 |
+
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 780 |
+
outputs=[choose_custom_tts_model],
|
| 781 |
+
)
|
| 782 |
+
custom_vocab_path.change(
|
| 783 |
+
switch_tts_model,
|
| 784 |
+
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 785 |
+
outputs=[choose_custom_tts_model],
|
| 786 |
)
|
| 787 |
|
| 788 |
gr.TabbedInterface(
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "f5-tts"
|
| 7 |
-
|
| 8 |
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
|
| 9 |
readme = "README.md"
|
| 10 |
license = {text = "MIT License"}
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "f5-tts"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
|
| 9 |
readme = "README.md"
|
| 10 |
license = {text = "MIT License"}
|
src/f5_tts/infer/README.md
CHANGED
|
@@ -2,6 +2,8 @@
|
|
| 2 |
|
| 3 |
The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
|
| 4 |
|
|
|
|
|
|
|
| 5 |
Currently supports **30s for a single** generation, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text; they will automatically do chunked generation. Long reference audio will be **clipped to ~15s**.
|
| 6 |
|
| 7 |
To avoid possible inference failures, make sure you have seen through the following instructions.
|
|
|
|
| 2 |
|
| 3 |
The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
|
| 4 |
|
| 5 |
+
More checkpoints with whole community efforts can be found [here](src/f5_tts/infer/SHARED.md), supporting more languages.
|
| 6 |
+
|
| 7 |
Currently supports **30s for a single** generation, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text; they will automatically do chunked generation. Long reference audio will be **clipped to ~15s**.
|
| 8 |
|
| 9 |
To avoid possible inference failures, make sure you have seen through the following instructions.
|
src/f5_tts/infer/SHARED.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- omit in toc -->
|
| 2 |
+
# Shared Model Cards
|
| 3 |
+
|
| 4 |
+
- This document serves as a quick lookup table for community training/finetuning results, with support for various languages.
|
| 5 |
+
- The models in this repository are open source and are based on voluntary community contributions.
|
| 6 |
+
- Use of these models must respect their respective creators; the convenience they provide comes from the creators' efforts.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
<!-- omit in toc -->
|
| 10 |
+
### Support Language
|
| 11 |
+
- [Multilingual](#multilingual)
|
| 12 |
+
- [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
|
| 13 |
+
- [Mandarin](#mandarin)
|
| 14 |
+
- [English](#english)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
### Multilingual
|
| 18 |
+
|
| 19 |
+
#### F5-TTS Base @ pretrain @ zh & en
|
| 20 |
+
|Model|🤗 Hugging Face|Data (Hours)|Model License|
|
| 21 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 22 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
|
| 26 |
+
VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
|
| 27 |
+
```
|
| 28 |
+
***Other infos, e.g. Github Repo, Usage Instruction, Tutorial (Blog, Video, etc.) ...***
|
| 29 |
+
|
| 30 |
+
### Mandarin
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
### English
|
| 34 |
+
|