txya900619 committed

Commit 0c074b9 · Parent: 947b905

feat: add app.py
DEMO.md ADDED
@@ -0,0 +1,17 @@
+ # 原語會族語語音合成系統
+
+ ILRDF Formosan Text-To-Speech System
+
+ ## 研發團隊 (Development Team)
+
+ - [李鴻欣 Hung-Shin Lee](mailto:[email protected])
+ - [陳力瑋 Li-Wei Chen](mailto:[email protected])
+ - [意傳科技](https://ithuan.tw/)
+ - [原住民族語言研究發展基金會](https://www.ilrdf.org.tw/)
+
+ ## 特別致謝 (Special Thanks)
+ - [聯和科創](https://www.104.com.tw/company/1a2x6bmu75)
+ - [Pipalofasaran to Sowal no Pangcah/'Amis 台灣阿美族語言永續發展學會](https://www.facebook.com/groups/ypspt/about)
+ - [台灣太魯閣族語言發展學會](https://qkktt.com/)
+ - [台灣原住民族賽德克族語言文化學會](https://www.facebook.com/3S3TBL/)
+ - 族語老師們 (the Indigenous language teachers)
app.py ADDED
@@ -0,0 +1,274 @@
+ import tempfile
+
+ import gradio as gr
+ import soundfile as sf
+ import torchaudio
+ from cached_path import cached_path
+ from omegaconf import OmegaConf
+
+ from ipa.ipa import text_to_ipa
+
+ try:
+     import spaces
+
+     USING_SPACES = True
+ except ImportError:
+     USING_SPACES = False
+
+ from f5_tts.infer.utils_infer import (
+     infer_process,
+     load_model,
+     load_vocoder,
+     preprocess_ref_audio_text,
+     remove_silence_for_generated_wav,
+     save_spectrogram,
+ )
+ from f5_tts.model import DiT
+
+
+ def gpu_decorator(func):
+     # Only wrap with spaces.GPU when running on Hugging Face Spaces.
+     if USING_SPACES:
+         return spaces.GPU(func)
+     else:
+         return func
+
+
+ vocoder = load_vocoder()
+
+
+ def load_f5tts(ckpt_path, vocab_path):
+     ckpt_path = str(cached_path(ckpt_path))
+     F5TTS_model_cfg = dict(
+         dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
+     )
+     vocab_path = str(cached_path(vocab_path))
+     return load_model(DiT, F5TTS_model_cfg, ckpt_path, vocab_file=vocab_path)
+
+
+ OmegaConf.register_new_resolver("load_f5tts", load_f5tts)
+
+ models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
+ # NOTE: configs/dialects.yaml is expected alongside models.yaml.
+ dialects = OmegaConf.to_object(OmegaConf.load("configs/dialects.yaml"))
+
+
+ DEFAULT_MODEL_ID = list(models_config.keys())[0]
+ DEFAULT_DIALECT = list(dialects.values())[0]
+
+
+ @gpu_decorator
+ def infer(
+     ref_audio_orig,
+     ref_text,
+     gen_text,
+     model,
+     remove_silence,
+     cross_fade_duration=0.15,
+     nfe_step=32,
+     fix_duration=None,  # total (reference + generated) length in seconds; None lets the model estimate it
+     show_info=gr.Info,
+ ):
+     if not ref_audio_orig:
+         gr.Warning("Please provide reference audio.")
+         return gr.update(), gr.update()
+
+     if not gen_text.strip():
+         gr.Warning("Please enter text to generate.")
+         return gr.update(), gr.update()
+
+     ref_audio, ref_text = preprocess_ref_audio_text(
+         ref_audio_orig, ref_text, show_info=show_info
+     )
+
+     final_wave, final_sample_rate, combined_spectrogram = infer_process(
+         ref_audio,
+         ref_text,
+         gen_text,
+         model,
+         vocoder,
+         cross_fade_duration=cross_fade_duration,
+         nfe_step=nfe_step,
+         fix_duration=fix_duration,
+         show_info=show_info,
+         progress=gr.Progress(),
+     )
+
+     # Optionally remove silences the model tends to insert in long generations.
+     if remove_silence:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+             sf.write(f.name, final_wave, final_sample_rate)
+             remove_silence_for_generated_wav(f.name)
+             final_wave, _ = torchaudio.load(f.name)
+         final_wave = final_wave.squeeze().cpu().numpy()
+
+     print(f"Final wave duration: {final_wave.shape[0] / final_sample_rate:.2f}s")
+
+     # Save the spectrogram to a temporary PNG for display.
+     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+         spectrogram_path = tmp_spectrogram.name
+     save_spectrogram(combined_spectrogram, spectrogram_path)
+
+     return (final_sample_rate, final_wave), spectrogram_path
+
+
+ def get_title():
+     # The page title is the first heading of DEMO.md.
+     with open("DEMO.md") as tong:
+         return tong.readline().strip().lstrip("# ")
+
+
+ demo = gr.Blocks(
+     title=get_title(),
+     css="@import url(https://tauhu.tw/tauhu-oo.css);",
+     theme=gr.themes.Default(
+         font=(
+             "tauhu-oo",
+             gr.themes.GoogleFont("Source Sans Pro"),
+             "ui-sans-serif",
+             "system-ui",
+             "sans-serif",
+         )
+     ),
+ )
+
+ with demo:
+     with open("DEMO.md") as tong:
+         gr.Markdown(tong.read())
+
+     with gr.Row():
+         with gr.Column():
+             model_drop_down = gr.Dropdown(
+                 models_config.keys(),
+                 value=DEFAULT_MODEL_ID,
+                 label="模型",  # "Model"
+             )
+
+             ref_audio_input = gr.Audio(
+                 type="filepath",
+                 waveform_options=gr.WaveformOptions(
+                     sample_rate=24000,
+                 ),
+                 label="Reference Audio",
+             )
+             ref_text_input = gr.Textbox(
+                 value="",
+                 label="Reference Text",
+             )
+
+             gen_text_input = gr.Textbox(
+                 label="Text to Generate",
+                 value="",
+             )
+
+             generate_btn = gr.Button("Synthesize", variant="primary")
+
+             with gr.Accordion("Advanced Settings", open=False):
+                 remove_silence = gr.Checkbox(
+                     label="Remove Silences",
+                     info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
+                     value=False,
+                 )
+                 speed_slider = gr.Slider(
+                     label="Speed",
+                     minimum=0.3,
+                     maximum=2.0,
+                     value=1.0,
+                     step=0.1,
+                     info="語速(越小越慢)",  # "Speech rate (lower is slower)"
+                 )
+                 nfe_slider = gr.Slider(
+                     label="NFE Steps",
+                     minimum=4,
+                     maximum=64,
+                     value=32,
+                     step=2,
+                     info="Set the number of denoising steps.",
+                 )
+                 cross_fade_duration_slider = gr.Slider(
+                     label="Cross-Fade Duration (s)",
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=0.15,
+                     step=0.01,
+                     info="Set the duration of the cross-fade between audio clips.",
+                 )
+         with gr.Column():
+             audio_output = gr.Audio(label="Synthesized Audio")
+             spectrogram_output = gr.Image(label="Spectrogram")
+
+     @gpu_decorator
+     def basic_tts(
+         model_drop_down: str,
+         ref_audio_input: str,
+         ref_text_input: str,
+         gen_text_input: str,
+         remove_silence: bool,
+         cross_fade_duration_slider: float,
+         nfe_slider: int,
+         speed_slider: float,
+     ):
+         # Validate before computing durations, so an empty reference text
+         # cannot cause a division by zero below.
+         if len(ref_text_input) == 0 or len(gen_text_input) == 0:
+             raise gr.Error("請勿輸入空字串。")  # "Please do not enter an empty string."
+
+         # Estimate the generated duration from the character-length ratio of
+         # the two texts, scaled by the requested speed.
+         ref_audio_info = torchaudio.info(ref_audio_input)
+         ref_duration = ref_audio_info.num_frames / ref_audio_info.sample_rate
+         target_duration = (
+             ref_duration
+             * len(gen_text_input.replace(" ", ""))
+             / len(ref_text_input.replace(" ", ""))
+             / speed_slider
+         )
+         print(f"Reference duration: {ref_duration}")
+         print(f"Target duration: {target_duration}")
+
+         ref_text_input = text_to_ipa(ref_text_input)
+         gen_text_input = text_to_ipa(gen_text_input)
+
+         audio_out, spectrogram_path = infer(
+             ref_audio_input,
+             ref_text_input,
+             gen_text_input,
+             models_config[model_drop_down],
+             remove_silence,
+             cross_fade_duration=cross_fade_duration_slider,
+             nfe_step=nfe_slider,
+             # fix_duration is the total clip length: reference plus generated.
+             fix_duration=ref_duration + target_duration,
+         )
+         return audio_out, spectrogram_path
+
+     generate_btn.click(
+         basic_tts,
+         inputs=[
+             model_drop_down,
+             ref_audio_input,
+             ref_text_input,
+             gen_text_input,
+             remove_silence,
+             cross_fade_duration_slider,
+             nfe_slider,
+             speed_slider,
+         ],
+         outputs=[audio_output, spectrogram_output],
+     )
+     gr.Examples(
+         [
+             [
+                 "./ref_wav/E-PV001-0085.wav",
+                 "romakat kako a talapicodadan to romi’ami’ad",
+                 "Mafana’ kiso a misanoPangcah haw?",
+             ],
+             [
+                 "./ref_wav/E-PV001-0254.wav",
+                 "kering sa masoni^ to ko pipahanhanan a tatokian o fe:soc no niyam a tayra i piondoan",
+                 "Pafelien cingra to misapoeneray a faloco', nanay mada'oc matilid i faloco' nira konini.",
+             ],
+         ],
+         label="範例",  # "Examples"
+         inputs=[
+             ref_audio_input,
+             ref_text_input,
+             gen_text_input,
+         ],
+     )
+
+ demo.launch()
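The duration arithmetic in `basic_tts` is worth reading on its own: the `fix_duration` handed to `infer` covers the whole output clip (reference prompt plus generated speech), so the script budgets the generated part from the character-length ratio of the two texts. A minimal sketch with illustrative numbers (the texts come from the examples above; the 3.2 s reference duration is made up):

```python
ref_duration = 3.2  # seconds of reference audio (illustrative value)
ref_text = "romakat kako a talapicodadan to romi’ami’ad"
gen_text = "Mafana’ kiso a misanoPangcah haw?"
speed = 1.0  # UI default

# Same formula as basic_tts: scale by non-space character counts and speed.
target_duration = (
    ref_duration
    * len(gen_text.replace(" ", ""))
    / len(ref_text.replace(" ", ""))
    / speed
)
# infer() then receives the *total* clip length: reference + generated.
fix_duration = ref_duration + target_duration
print(f"generated ≈ {target_duration:.2f}s, total ≈ {fix_duration:.2f}s")
```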
configs/g2p.yaml ADDED
@@ -0,0 +1 @@
+ g2p: ${load_g2p:${gh_download:FormoSpeech/FormoG2P, formosan/g2p.csv}}
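The nested resolvers mean that simply loading this config fetches and parses the grapheme-to-phoneme table. Unfolded into plain Python, using the `gh_download` and `load_g2p` helpers that `ipa/__init__.py` registers later in this commit, the line is roughly equivalent to:

```python
from ipa import gh_download, load_g2p  # importing also registers the resolvers

# What ${load_g2p:${gh_download:...}} expands to when the config is resolved:
csv_text = gh_download("FormoSpeech/FormoG2P", "formosan/g2p.csv")
g2p = load_g2p(csv_text)  # {lang_tag: {grapheme: phoneme}}
```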
configs/models.yaml ADDED
@@ -0,0 +1,4 @@
+ step-24000: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_24000.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
+ step-48000: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_48000.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
+ step-60000: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_60000.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
+ step-76500: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_76500.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
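Each entry runs through the `load_f5tts` resolver registered in `app.py`. OmegaConf resolves values lazily, but `app.py` calls `OmegaConf.to_object(...)` on this file, which forces every entry to resolve, so all four checkpoints are downloaded and loaded at startup. A sketch of loading a single checkpoint instead (assuming the resolver from `app.py` has already been registered):

```python
from omegaconf import OmegaConf

# "load_f5tts" must already be registered, as app.py does at import time:
# OmegaConf.register_new_resolver("load_f5tts", load_f5tts)
cfg = OmegaConf.load("configs/models.yaml")
model = cfg["step-76500"]  # resolves (downloads and loads) only this entry
```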
ipa/__init__.py ADDED
@@ -0,0 +1,72 @@
+ import csv
+
+ import requests
+ from omegaconf import OmegaConf
+
+ # Fallback grapheme-to-phoneme entries for symbols missing from the sheet.
+ EXTRA_G2P = {
+     "z": "z",
+     "o": "o",
+     "h": "h",
+     "g": "g",
+     "y": "j",
+     "w": "w",
+     "c": "ʦ",
+     "u": "u",
+     "f": "f",
+     "v": "v",
+     "j": "ɟ",
+     "b": "b",
+     "q": "q",
+     "e": "e",
+     ",": ",",
+ }
+
+
+ def gh_download(repo, path):
+     """Fetch a file's raw text from a GitHub repository."""
+     headers = {
+         "Accept": "application/vnd.github.raw+json",
+     }
+
+     url = f"https://api.github.com/repos/{repo}/contents/{path}"
+     response = requests.get(url, headers=headers)
+     if response.status_code != 200:
+         raise Exception(f"Failed to download {path} from {repo}, response: {response}")
+     response.encoding = "utf-8-sig"
+
+     return response.text
+
+
+ def load_g2p(g2p_string):
+     """Parse the G2P CSV into {lang_tag: {grapheme: phoneme}}."""
+     g2p = dict()
+
+     csv_reader = csv.DictReader(g2p_string.split("\n"))
+
+     for row in csv_reader:
+         language = row["Language"]
+         dialect = row["Dialect"]
+
+         if dialect == "-":
+             lang_tag = f"{language}"
+         else:
+             lang_tag = f"{language}_{dialect}"
+
+         g2p[lang_tag] = g2p.get(lang_tag, {})
+
+         for key in row:
+             if key in ["Language", "Dialect"]:
+                 continue
+
+             if row[key] == "-":
+                 continue
+
+             # A cell may hold comma-separated variants; keep only the first.
+             g2p[lang_tag][key] = row[key].split(",")[0]
+
+         # Fill in fallbacks for graphemes the sheet does not define.
+         for g, p in EXTRA_G2P.items():
+             if g not in g2p[lang_tag]:
+                 g2p[lang_tag][g] = p
+
+     return g2p
+
+
+ OmegaConf.register_new_resolver("gh_download", gh_download)
+ OmegaConf.register_new_resolver("load_g2p", load_g2p)
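The shape `load_g2p` produces is easiest to see on a tiny hand-made CSV (the `ng`/`a` columns and the single 阿美/秀姑巒 row below are illustrative, not the real sheet):

```python
from ipa import load_g2p

sample_csv = "Language,Dialect,ng,a\n阿美,秀姑巒,ŋ,a\n"
g2p = load_g2p(sample_csv)

assert g2p["阿美_秀姑巒"]["ng"] == "ŋ"  # first variant from the cell
assert g2p["阿美_秀姑巒"]["c"] == "ʦ"   # filled in from EXTRA_G2P
```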
ipa/ipa.py ADDED
@@ -0,0 +1,58 @@
+ import re
+
+ from omegaconf import OmegaConf
+
+ # Resolving configs/g2p.yaml downloads and parses the G2P table; keep only
+ # the Amis Xiuguluan (阿美_秀姑巒) mapping.
+ XIUGULUAN_G2P = OmegaConf.to_object(OmegaConf.load("configs/g2p.yaml"))["g2p"][
+     "阿美_秀姑巒"
+ ]
+
+
+ def can_form_string(x, symbol_dict):
+     """Try to segment `x` into keys of `symbol_dict`.
+
+     Keys are tried in dictionary order with backtracking; returns
+     (True, matched_keys) for the first complete segmentation, else (False, []).
+     """
+     def helper(x, symbol_dict, matched_parts):
+         if not x:
+             return True, matched_parts
+
+         for key in symbol_dict.keys():
+             if x.startswith(key):
+                 result, parts = helper(
+                     x[len(key) :], symbol_dict, matched_parts + [key]
+                 )
+                 if result:
+                     return True, parts
+
+         return False, []
+
+     return helper(x, symbol_dict, [])
+
+
+ def text_to_ipa(text, ignore_comma=True):
+     ipa = []
+     text = text.lower()
+     text = re.sub(r"[.?!]", "", text)
+     text = text.replace("'", "’")
+     words = text.split()  # simple whitespace tokenization; may change in future
+
+     print(f"ipa: {words}")
+
+     extended_g2p = {**XIUGULUAN_G2P, ",": "" if ignore_comma else ","}
+
+     for word in words:
+         result, matched_parts = can_form_string(word, extended_g2p)
+
+         if result is False:
+             # Bail out with an empty string if any word cannot be segmented.
+             print(f"no match g2p : {word}")
+             return ""
+
+         ipa.append("".join(extended_g2p[part] for part in matched_parts))
+
+     # Normalize to standard IPA symbols.
+     ipa = (
+         " ".join(ipa)
+         .replace("g", "ɡ")
+         .replace("ʦ", "t͡s")
+         .replace("ʨ", "t͡ɕ")
+         .replace("R", "ʀ")
+         .replace("ʤ", "dʒ")
+     )
+     return ipa
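Note that `can_form_string` does not prefer the longest grapheme: it returns the first complete segmentation it finds, trying keys in dictionary order and backtracking out of dead ends. A toy illustration (the mapping is made up; importing `ipa.ipa` resolves `configs/g2p.yaml` at import time):

```python
from ipa.ipa import can_form_string

toy = {"a": "a", "ab": "b", "c": "ʦ"}
ok, parts = can_form_string("abc", toy)
print(ok, parts)  # True ['ab', 'c'] -- 'a' alone dead-ends at 'b', so it backtracks
```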
ref_wav/E-PV001-0085.wav ADDED
Binary file (309 kB).
ref_wav/E-PV001-0254.wav ADDED
Binary file (548 kB).
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ omegaconf
+ opencc
+ requests  # imported by ipa/__init__.py
+ git+https://github.com/SWivid/F5-TTS.git