mrfakename committed on
Commit
7804f9c
·
verified ·
1 Parent(s): e35df77

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

app.py CHANGED
@@ -38,22 +38,40 @@ from f5_tts.infer.utils_infer import (
38
  save_spectrogram,
39
  )
40
 
41
- vocoder = load_vocoder()
 
 
42
 
43
 
44
  # load models
45
- F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
46
- F5TTS_ema_model = load_model(
47
- DiT, F5TTS_model_cfg, str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
48
- )
49
 
50
- E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
51
- E2TTS_ema_model = load_model(
52
- UNetT, E2TTS_model_cfg, str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
53
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- DEFAULT_TTS_MODEL = "F5-TTS"
56
- tts_model_choice = DEFAULT_TTS_MODEL
57
  chat_model_state = None
58
  chat_tokenizer_state = None
59
 
@@ -90,7 +108,16 @@ def infer(
90
  if model == "F5-TTS":
91
  ema_model = F5TTS_ema_model
92
  elif model == "E2-TTS":
 
 
 
 
93
  ema_model = E2TTS_ema_model
 
 
 
 
 
94
 
95
  final_wave, final_sample_rate, combined_spectrogram = infer_process(
96
  ref_audio,
@@ -712,21 +739,50 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
712
  """
713
  )
714
 
715
- def switch_tts_model(new_choice):
716
  global tts_model_choice
717
- tts_model_choice = new_choice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
 
719
- if not USING_SPACES:
720
- choose_tts_model = gr.Radio(
721
- choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
722
- )
723
- else:
724
- choose_tts_model = gr.Radio(
725
- choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
726
- )
727
  choose_tts_model.change(
728
  switch_tts_model,
729
- inputs=choose_tts_model,
 
 
 
 
 
 
 
 
 
 
 
730
  )
731
 
732
  gr.TabbedInterface(
 
38
  save_spectrogram,
39
  )
40
 
41
+
42
+ DEFAULT_TTS_MODEL = "F5-TTS"
43
+ tts_model_choice = DEFAULT_TTS_MODEL
44
 
45
 
46
  # load models
 
 
 
 
47
 
48
+ vocoder = load_vocoder()
49
+
50
+
51
+ def load_f5tts(ckpt_path=str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))):
52
+ F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
53
+ return load_model(DiT, F5TTS_model_cfg, ckpt_path)
54
+
55
+
56
+ def load_e2tts(ckpt_path=str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))):
57
+ E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
58
+ return load_model(UNetT, E2TTS_model_cfg, ckpt_path)
59
+
60
+
61
+ def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
62
+ ckpt_path, vocab_path = ckpt_path.strip(), vocab_path.strip()
63
+ if ckpt_path.startswith("hf://"):
64
+ ckpt_path = str(cached_path(ckpt_path))
65
+ if vocab_path.startswith("hf://"):
66
+ vocab_path = str(cached_path(vocab_path))
67
+ if model_cfg is None:
68
+ model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
69
+ return load_model(DiT, model_cfg, ckpt_path, vocab_file=vocab_path)
70
+
71
+
72
+ F5TTS_ema_model = load_f5tts()
73
+ E2TTS_ema_model = load_e2tts() if USING_SPACES else None
74
 
 
 
75
  chat_model_state = None
76
  chat_tokenizer_state = None
77
 
 
108
  if model == "F5-TTS":
109
  ema_model = F5TTS_ema_model
110
  elif model == "E2-TTS":
111
+ global E2TTS_ema_model
112
+ if E2TTS_ema_model is None:
113
+ show_info("Loading E2-TTS model...")
114
+ E2TTS_ema_model = load_e2tts()
115
  ema_model = E2TTS_ema_model
116
+ elif isinstance(model, list) and model[0] == "Custom":
117
+ assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
118
+ show_info("Loading Custom TTS model...")
119
+ custom_ema_model = load_custom(model[1], vocab_path=model[2])
120
+ ema_model = custom_ema_model
121
 
122
  final_wave, final_sample_rate, combined_spectrogram = infer_process(
123
  ref_audio,
 
739
  """
740
  )
741
 
742
+ def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
743
  global tts_model_choice
744
+ if new_choice == "Custom":
745
+ tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
746
+ return gr.update(visible=True)
747
+ else:
748
+ tts_model_choice = new_choice
749
+ return gr.update(visible=False)
750
+
751
+ with gr.Row():
752
+ if not USING_SPACES:
753
+ choose_tts_model = gr.Radio(
754
+ choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
755
+ )
756
+ else:
757
+ choose_tts_model = gr.Radio(
758
+ choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
759
+ )
760
+ with gr.Column(visible=False) as choose_custom_tts_model:
761
+ custom_ckpt_path = gr.Textbox(
762
+ placeholder="MODEL_CKPT: local_path | hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
763
+ show_label=False,
764
+ min_width=200,
765
+ )
766
+ custom_vocab_path = gr.Textbox(
767
+ placeholder="VOCAB_FILE: local_path | hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt | leave blank to use default",
768
+ show_label=False,
769
+ min_width=200,
770
+ )
771
 
 
 
 
 
 
 
 
 
772
  choose_tts_model.change(
773
  switch_tts_model,
774
+ inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
775
+ outputs=[choose_custom_tts_model],
776
+ )
777
+ custom_ckpt_path.change(
778
+ switch_tts_model,
779
+ inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
780
+ outputs=[choose_custom_tts_model],
781
+ )
782
+ custom_vocab_path.change(
783
+ switch_tts_model,
784
+ inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
785
+ outputs=[choose_custom_tts_model],
786
  )
787
 
788
  gr.TabbedInterface(
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "f5-tts"
7
- dynamic = ["version"]
8
  description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
9
  readme = "README.md"
10
  license = {text = "MIT License"}
 
4
 
5
  [project]
6
  name = "f5-tts"
7
+ version = "0.1.0"
8
  description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
9
  readme = "README.md"
10
  license = {text = "MIT License"}
src/f5_tts/infer/README.md CHANGED
@@ -2,6 +2,8 @@
2
 
3
  The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
4
 
 
 
5
  Currently supports up to **30s for a single** generation, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text, and they will automatically do chunked generation. Long reference audio will be **clipped short to ~15s**.
6
 
7
  To avoid possible inference failures, make sure you have read through the following instructions.
 
2
 
3
  The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
4
 
5
+ More checkpoints contributed by the community, supporting more languages, can be found [here](SHARED.md).
6
+
7
  Currently supports up to **30s for a single** generation, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text, and they will automatically do chunked generation. Long reference audio will be **clipped short to ~15s**.
8
 
9
  To avoid possible inference failures, make sure you have read through the following instructions.
src/f5_tts/infer/SHARED.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- omit in toc -->
2
+ # Shared Model Cards
3
+
4
+ - This document serves as a quick lookup table for community training/finetuning results, covering a variety of languages.
5
+ - The models in this repository are open source and rely on voluntary contributions from the community.
6
+ - Use of these models must be conditioned on respect for their respective creators; the convenience they bring comes from those creators' efforts.
7
+
8
+
9
+ <!-- omit in toc -->
10
+ ### Supported Languages
11
+ - [Multilingual](#multilingual)
12
+ - [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
13
+ - [Mandarin](#mandarin)
14
+ - [English](#english)
15
+
16
+
17
+ ### Multilingual
18
+
19
+ #### F5-TTS Base @ pretrain @ zh & en
20
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
21
+ |:---:|:------------:|:-----------:|:-------------:|
22
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
23
+
24
+ ```bash
25
+ MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
26
+ VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
27
+ ```
28
+ ***Other info, e.g. GitHub repo, usage instructions, tutorials (blog, video, etc.) ...***
29
+
30
+ ### Mandarin
31
+
32
+
33
+ ### English
34
+