Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- app.py +78 -22
- pyproject.toml +1 -1
- src/f5_tts/infer/README.md +2 -0
- src/f5_tts/infer/SHARED.md +34 -0
    	
        app.py
    CHANGED
    
    | @@ -38,22 +38,40 @@ from f5_tts.infer.utils_infer import ( | |
| 38 | 
             
                save_spectrogram,
         | 
| 39 | 
             
            )
         | 
| 40 |  | 
| 41 | 
            -
             | 
|  | |
|  | |
| 42 |  | 
| 43 |  | 
| 44 | 
             
            # load models
         | 
| 45 | 
            -
            F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
         | 
| 46 | 
            -
            F5TTS_ema_model = load_model(
         | 
| 47 | 
            -
                DiT, F5TTS_model_cfg, str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
         | 
| 48 | 
            -
            )
         | 
| 49 |  | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
            )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 54 |  | 
| 55 | 
            -
            DEFAULT_TTS_MODEL = "F5-TTS"
         | 
| 56 | 
            -
            tts_model_choice = DEFAULT_TTS_MODEL
         | 
| 57 | 
             
            chat_model_state = None
         | 
| 58 | 
             
            chat_tokenizer_state = None
         | 
| 59 |  | 
| @@ -90,7 +108,16 @@ def infer( | |
| 90 | 
             
                if model == "F5-TTS":
         | 
| 91 | 
             
                    ema_model = F5TTS_ema_model
         | 
| 92 | 
             
                elif model == "E2-TTS":
         | 
|  | |
|  | |
|  | |
|  | |
| 93 | 
             
                    ema_model = E2TTS_ema_model
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 94 |  | 
| 95 | 
             
                final_wave, final_sample_rate, combined_spectrogram = infer_process(
         | 
| 96 | 
             
                    ref_audio,
         | 
| @@ -712,21 +739,50 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip | |
| 712 | 
             
            """
         | 
| 713 | 
             
                )
         | 
| 714 |  | 
| 715 | 
            -
                def switch_tts_model(new_choice):
         | 
| 716 | 
             
                    global tts_model_choice
         | 
| 717 | 
            -
                     | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 718 |  | 
| 719 | 
            -
                if not USING_SPACES:
         | 
| 720 | 
            -
                    choose_tts_model = gr.Radio(
         | 
| 721 | 
            -
                        choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
         | 
| 722 | 
            -
                    )
         | 
| 723 | 
            -
                else:
         | 
| 724 | 
            -
                    choose_tts_model = gr.Radio(
         | 
| 725 | 
            -
                        choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
         | 
| 726 | 
            -
                    )
         | 
| 727 | 
             
                choose_tts_model.change(
         | 
| 728 | 
             
                    switch_tts_model,
         | 
| 729 | 
            -
                    inputs=choose_tts_model,
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 730 | 
             
                )
         | 
| 731 |  | 
| 732 | 
             
                gr.TabbedInterface(
         | 
|  | |
| 38 | 
             
                save_spectrogram,
         | 
| 39 | 
             
            )
         | 
| 40 |  | 
| 41 | 
            +
             | 
| 42 | 
            +
            DEFAULT_TTS_MODEL = "F5-TTS"
         | 
| 43 | 
            +
            tts_model_choice = DEFAULT_TTS_MODEL
         | 
| 44 |  | 
| 45 |  | 
| 46 | 
             
            # load models
         | 
|  | |
|  | |
|  | |
|  | |
| 47 |  | 
| 48 | 
            +
            vocoder = load_vocoder()
         | 
| 49 | 
            +
             | 
| 50 | 
            +
             | 
| 51 | 
            +
            def load_f5tts(ckpt_path=str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))):
         | 
| 52 | 
            +
                F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
         | 
| 53 | 
            +
                return load_model(DiT, F5TTS_model_cfg, ckpt_path)
         | 
| 54 | 
            +
             | 
| 55 | 
            +
             | 
| 56 | 
            +
            def load_e2tts(ckpt_path=str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))):
         | 
| 57 | 
            +
                E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
         | 
| 58 | 
            +
                return load_model(UNetT, E2TTS_model_cfg, ckpt_path)
         | 
| 59 | 
            +
             | 
| 60 | 
            +
             | 
| 61 | 
            +
            def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
         | 
| 62 | 
            +
                ckpt_path, vocab_path = ckpt_path.strip(), vocab_path.strip()
         | 
| 63 | 
            +
                if ckpt_path.startswith("hf://"):
         | 
| 64 | 
            +
                    ckpt_path = str(cached_path(ckpt_path))
         | 
| 65 | 
            +
                if vocab_path.startswith("hf://"):
         | 
| 66 | 
            +
                    vocab_path = str(cached_path(vocab_path))
         | 
| 67 | 
            +
                if model_cfg is None:
         | 
| 68 | 
            +
                    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
         | 
| 69 | 
            +
                return load_model(DiT, model_cfg, ckpt_path, vocab_file=vocab_path)
         | 
| 70 | 
            +
             | 
| 71 | 
            +
             | 
| 72 | 
            +
            F5TTS_ema_model = load_f5tts()
         | 
| 73 | 
            +
            E2TTS_ema_model = load_e2tts() if USING_SPACES else None
         | 
| 74 |  | 
|  | |
|  | |
| 75 | 
             
            chat_model_state = None
         | 
| 76 | 
             
            chat_tokenizer_state = None
         | 
| 77 |  | 
|  | |
| 108 | 
             
                if model == "F5-TTS":
         | 
| 109 | 
             
                    ema_model = F5TTS_ema_model
         | 
| 110 | 
             
                elif model == "E2-TTS":
         | 
| 111 | 
            +
                    global E2TTS_ema_model
         | 
| 112 | 
            +
                    if E2TTS_ema_model is None:
         | 
| 113 | 
            +
                        show_info("Loading E2-TTS model...")
         | 
| 114 | 
            +
                        E2TTS_ema_model = load_e2tts()
         | 
| 115 | 
             
                    ema_model = E2TTS_ema_model
         | 
| 116 | 
            +
                elif isinstance(model, list) and model[0] == "Custom":
         | 
| 117 | 
            +
                    assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
         | 
| 118 | 
            +
                    show_info("Loading Custom TTS model...")
         | 
| 119 | 
            +
                    custom_ema_model = load_custom(model[1], vocab_path=model[2])
         | 
| 120 | 
            +
                    ema_model = custom_ema_model
         | 
| 121 |  | 
| 122 | 
             
                final_wave, final_sample_rate, combined_spectrogram = infer_process(
         | 
| 123 | 
             
                    ref_audio,
         | 
|  | |
| 739 | 
             
            """
         | 
| 740 | 
             
                )
         | 
| 741 |  | 
| 742 | 
            +
                def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
         | 
| 743 | 
             
                    global tts_model_choice
         | 
| 744 | 
            +
                    if new_choice == "Custom":
         | 
| 745 | 
            +
                        tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
         | 
| 746 | 
            +
                        return gr.update(visible=True)
         | 
| 747 | 
            +
                    else:
         | 
| 748 | 
            +
                        tts_model_choice = new_choice
         | 
| 749 | 
            +
                        return gr.update(visible=False)
         | 
| 750 | 
            +
             | 
| 751 | 
            +
                with gr.Row():
         | 
| 752 | 
            +
                    if not USING_SPACES:
         | 
| 753 | 
            +
                        choose_tts_model = gr.Radio(
         | 
| 754 | 
            +
                            choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
         | 
| 755 | 
            +
                        )
         | 
| 756 | 
            +
                    else:
         | 
| 757 | 
            +
                        choose_tts_model = gr.Radio(
         | 
| 758 | 
            +
                            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
         | 
| 759 | 
            +
                        )
         | 
| 760 | 
            +
                    with gr.Column(visible=False) as choose_custom_tts_model:
         | 
| 761 | 
            +
                        custom_ckpt_path = gr.Textbox(
         | 
| 762 | 
            +
                            placeholder="MODEL_CKPT:  local_path  |  hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
         | 
| 763 | 
            +
                            show_label=False,
         | 
| 764 | 
            +
                            min_width=200,
         | 
| 765 | 
            +
                        )
         | 
| 766 | 
            +
                        custom_vocab_path = gr.Textbox(
         | 
| 767 | 
            +
                            placeholder="VOCAB_FILE:  local_path  |  hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt  |  leave blank to use default",
         | 
| 768 | 
            +
                            show_label=False,
         | 
| 769 | 
            +
                            min_width=200,
         | 
| 770 | 
            +
                        )
         | 
| 771 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 772 | 
             
                choose_tts_model.change(
         | 
| 773 | 
             
                    switch_tts_model,
         | 
| 774 | 
            +
                    inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
         | 
| 775 | 
            +
                    outputs=[choose_custom_tts_model],
         | 
| 776 | 
            +
                )
         | 
| 777 | 
            +
                custom_ckpt_path.change(
         | 
| 778 | 
            +
                    switch_tts_model,
         | 
| 779 | 
            +
                    inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
         | 
| 780 | 
            +
                    outputs=[choose_custom_tts_model],
         | 
| 781 | 
            +
                )
         | 
| 782 | 
            +
                custom_vocab_path.change(
         | 
| 783 | 
            +
                    switch_tts_model,
         | 
| 784 | 
            +
                    inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
         | 
| 785 | 
            +
                    outputs=[choose_custom_tts_model],
         | 
| 786 | 
             
                )
         | 
| 787 |  | 
| 788 | 
             
                gr.TabbedInterface(
         | 
    	
        pyproject.toml
    CHANGED
    
    | @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" | |
| 4 |  | 
| 5 | 
             
            [project]
         | 
| 6 | 
             
            name = "f5-tts"
         | 
| 7 | 
            -
             | 
| 8 | 
             
            description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
         | 
| 9 | 
             
            readme = "README.md"
         | 
| 10 | 
             
            license = {text = "MIT License"}
         | 
|  | |
| 4 |  | 
| 5 | 
             
            [project]
         | 
| 6 | 
             
            name = "f5-tts"
         | 
| 7 | 
            +
            version = "0.1.0"
         | 
| 8 | 
             
            description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
         | 
| 9 | 
             
            readme = "README.md"
         | 
| 10 | 
             
            license = {text = "MIT License"}
         | 
    	
        src/f5_tts/infer/README.md
    CHANGED
    
    | @@ -2,6 +2,8 @@ | |
| 2 |  | 
| 3 | 
             
            The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
         | 
| 4 |  | 
|  | |
|  | |
| 5 | 
             
            Currently supports **30s for a single** generation, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text, and they will automatically do chunk generation. Long reference audio will be **clipped short to ~15s**.
         | 
| 6 |  | 
| 7 | 
             
            To avoid possible inference failures, make sure you have seen through the following instructions.
         | 
|  | |
| 2 |  | 
| 3 | 
             
            The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
         | 
| 4 |  | 
| 5 | 
            +
            More checkpoints contributed by the community, supporting more languages, can be found [here](src/f5_tts/infer/SHARED.md).
         | 
| 6 | 
            +
             | 
| 7 | 
             
            Currently supports **30s for a single** generation, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text, and they will automatically do chunk generation. Long reference audio will be **clipped short to ~15s**.
         | 
| 8 |  | 
| 9 | 
             
            To avoid possible inference failures, make sure you have seen through the following instructions.
         | 
    	
        src/f5_tts/infer/SHARED.md
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <!-- omit in toc -->
         | 
| 2 | 
            +
            # Shared Model Cards
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            - This document serves as a quick lookup table for community training/fine-tuning results, with support for various languages.
         | 
| 5 | 
            +
            - The models in this repository are open source and are based on voluntary contributions from contributors.
         | 
| 6 | 
            +
            - The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            <!-- omit in toc -->
         | 
| 10 | 
            +
            ### Support Language
         | 
| 11 | 
            +
            - [Multilingual](#multilingual)
         | 
| 12 | 
            +
              - [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
         | 
| 13 | 
            +
            - [Mandarin](#mandarin)
         | 
| 14 | 
            +
            - [English](#english)
         | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
| 17 | 
            +
            ### Multilingual
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            #### F5-TTS Base @ pretrain @ zh & en
         | 
| 20 | 
            +
            |Model|🤗Hugging Face|Data (Hours)|Model License|
         | 
| 21 | 
            +
            |:---:|:------------:|:-----------:|:-------------:|
         | 
| 22 | 
            +
            |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            ```bash
         | 
| 25 | 
            +
            MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
         | 
| 26 | 
            +
            VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
         | 
| 27 | 
            +
            ```
         | 
| 28 | 
            +
            ***Other infos, e.g. Github Repo, Usage Instruction, Tutorial (Blog, Video, etc.) ...***
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            ### Mandarin
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            ### English
         | 
| 34 | 
            +
             | 
 
			
