Spaces:

benjaminzhang
/

cou_tts

Sleeping

App Files Files Community

benjaminzhang committed on Aug 12

Commit

593b69f

1 Parent(s): 4cd9425

Fix model/config path for Hugging Face

Browse files

Files changed (1) hide show

app.py +32 -108

app.py CHANGED Viewed

@@ -1,11 +1,4 @@
-# truku_tts_gradio_app.py (極簡版)
-# ------------------------------------------------------------
-# 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
-# 需求：只輸入文字 → 直接合成（固定 model/config/scales）
-# - UI：只保留文字輸入、合成按鈕、音檔播放/下載
-# - 其餘選項全部移除
-# ------------------------------------------------------------
 import os
 import json
 import time
@@ -14,114 +7,46 @@ import soundfile as sf
 import onnxruntime as ort
 import gradio as gr
-# ======== 固定參數（依你的環境修改） ========
 MODEL_PATH = "cou_total.onnx"
 CONFIG_PATH = "cou_medium.onnx.json"
 SCALES = (0.667, 1.0, 0.8)  # (length_scale, noise_scale, noise_w)
-# ======== 輔助：載入 config/phoneme_map、建立 ONNX session ========
-with open(CONFIG_PATH, "r", encoding="utf-8") as f:
-    _config = json.load(f)
-_phoneme_map = _config["phoneme_id_map"]
-_pad_id = _phoneme_map["_"]
-_bos_id = _phoneme_map["^"]
-_eos_id = _phoneme_map["$"]
-# 若需要 GPU，可改 providers，例如：
-# providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
-_session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
-def _text_to_ids(text: str) -> np.ndarray:
-    # 逐字轉 id；未知字用 pad_id；前後加 BOS/EOS
-    ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
-    return np.array(ids, dtype=np.int64)
-def synthesize(text: str):
-    text = (text or "").strip()
-    if not text:
-        raise gr.Error("請輸入要合成的文字！")
-    ids = _text_to_ids(text).reshape(1, -1)
-    ids_len = np.array([ids.shape[1]], dtype=np.int64)
-    scales = np.array(list(SCALES), dtype=np.float32)
-    start = time.time()
-    audio = _session.run(
-        None,
-        {"input": ids, "input_lengths": ids_len, "scales": scales},
-    )[0].squeeze()
-    rt = round(time.time() - start, 3)
-    sr = int(_config["audio"]["sample_rate"])  # 取樣率
-    # 同時寫檔，供下載
-    out_name = f"truku_tts_{int(time.time()*1000)}.wav"
-    out_path = os.path.abspath(out_name)
-    sf.write(out_path, audio, samplerate=sr)
-    # gr.Audio 可直接用 (sr, waveform)
-    return (sr, audio), out_path
-# ======== 極簡 UI：只有一個輸入 + 合成 + 音檔 ========
-demo = gr.Interface(
-    fn=synthesize,
-    inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
-    outputs=[
-        gr.Audio(label="合成音檔", interactive=False, show_download_button=True),
-        gr.File(label="下載 WAV 檔"),
-    ],
-    title="太魯閣語語音合成 (女聲) — ONNX",
-    description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
-)
-if __name__ == "__main__":
-    # 本機建議：若在 WSL，無法用 localhost 時，改用 share=True 或用 127.0.0.1:port
-    demo.launch(server_name="127.0.0.1", server_port=7860)
-# truku_tts_gradio_app.py (極簡版)
-# ------------------------------------------------------------
-# 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
-# 需求：只輸入文字 → 直接合成（固定 model/config/scales）
-# - UI：只保留文字輸入、合成按鈕、音檔播放/下載
-# - 其餘選項全部移除
-# ------------------------------------------------------------
-import os
-import json
-import time
-import numpy as np
-import soundfile as sf
-import onnxruntime as ort
-import gradio as gr
-# ======== 固定參數（依你的環境修改） ========
-MODEL_PATH = "/home/benjamin/TTS/cou_total.onnx"
-CONFIG_PATH = "/home/benjamin/TTS/cou_medium.onnx.json"
-SCALES = (0.667, 1.0, 0.8)  # (length_scale, noise_scale, noise_w)
-# ======== 輔助：載入 config/phoneme_map、建立 ONNX session ========
-with open(CONFIG_PATH, "r", encoding="utf-8") as f:
-    _config = json.load(f)
-_phoneme_map = _config["phoneme_id_map"]
-_pad_id = _phoneme_map["_"]
-_bos_id = _phoneme_map["^"]
-_eos_id = _phoneme_map["$"]
-# 若需要 GPU，可改 providers，例如：
-# providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
 _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
 def _text_to_ids(text: str) -> np.ndarray:
     """Convert ``text`` into a 1-D int64 array of ids, one per character.

     Each character is looked up in ``_phoneme_map``; a character that is
     not a key of the map is replaced by ``_pad_id``. ``_bos_id`` is placed
     in front of the sequence and ``_eos_id`` after it.
     """
     # Per-character lookup; unknown characters use pad_id; BOS/EOS wrap the sequence.
     ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
     return np.array(ids, dtype=np.int64)
 def synthesize(text: str):
     text = (text or "").strip()
     if not text:
@@ -131,25 +56,23 @@ def synthesize(text: str):
     ids_len = np.array([ids.shape[1]], dtype=np.int64)
     scales = np.array(list(SCALES), dtype=np.float32)
-    start = time.time()
     audio = _session.run(
         None,
         {"input": ids, "input_lengths": ids_len, "scales": scales},
     )[0].squeeze()
-    rt = round(time.time() - start, 3)
-    sr = int(_config["audio"]["sample_rate"])  # 取樣率
-    # 同時寫檔，供下載
     out_name = f"truku_tts_{int(time.time()*1000)}.wav"
     out_path = os.path.abspath(out_name)
     sf.write(out_path, audio, samplerate=sr)
-    # gr.Audio 可直接用 (sr, waveform)
     return (sr, audio), out_path
-# ======== 極簡 UI：只有一個輸入 + 合成 + 音檔 ========
 demo = gr.Interface(
     fn=synthesize,
     inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
@@ -159,8 +82,9 @@ demo = gr.Interface(
     ],
     title="太魯閣語語音合成 (女聲) — ONNX",
     description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
 )
 if __name__ == "__main__":
-    # 本機建議：若在 WSL，無法用 localhost 時，改用 share=True 或用 127.0.0.1:port
-    demo.launch(server_name="127.0.0.1", server_port=7860)

+# 太魯閣語 (Truku) TTS — ONNX + Gradio（Hugging Face 版，公開可用）
 import os
 import json
 import time
 import onnxruntime as ort
 import gradio as gr
+# ======== Fixed parameters (relative paths; files sit at the same level as app.py) ========
 MODEL_PATH = "cou_total.onnx"
 CONFIG_PATH = "cou_medium.onnx.json"
 SCALES = (0.667, 1.0, 0.8)  # (length_scale, noise_scale, noise_w)
+# ======== 載入 config 與建立 ONNX session ========
+def _load_config(cfg_path: str):
+    """Read the UTF-8 JSON file at ``cfg_path`` and return the parsed object.
+
+    Raises:
+        FileNotFoundError: if ``cfg_path`` does not exist (checked up front
+            so the error message names the missing config file).
+    """
+    if not os.path.exists(cfg_path):
+        raise FileNotFoundError(f"找不到設定檔：{cfg_path}")
+    with open(cfg_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+# Module-level initialisation: load the config, validate the keys used below,
+# then create the ONNX session. Any failure raises and aborts the import.
+try:
+    _config = _load_config(CONFIG_PATH)
+except Exception as e:
+    # Let Spaces report the error clearly in the interface.
+    # NOTE(review): consider `raise ... from e` so the original cause is
+    # chained explicitly — confirm before changing.
+    raise RuntimeError(f"CONFIG 載入失敗：{e}")
+if "phoneme_id_map" not in _config:
+    raise RuntimeError("CONFIG 缺少 'phoneme_id_map' 欄位")
+_phoneme_map = _config["phoneme_id_map"]
+# Ids of the three special symbols "_", "^" and "$"; .get() yields None for a
+# missing symbol so all three can be validated together just below.
+_pad_id = _phoneme_map.get("_")
+_bos_id = _phoneme_map.get("^")
+_eos_id = _phoneme_map.get("$")
+if None in (_pad_id, _bos_id, _eos_id):
+    raise RuntimeError("CONFIG 的 phoneme_id_map 缺少 _、^、$ 其中之一")
+# Check for the model file explicitly so the error message names the path.
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"找不到模型檔：{MODEL_PATH}")
+# For GPU, change to providers=["CUDAExecutionProvider","CPUExecutionProvider"]
 _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
 def _text_to_ids(text: str) -> np.ndarray:
     """Convert ``text`` into a 1-D int64 array of ids, one per character.

     Each character is looked up in ``_phoneme_map``; a character that is
     not a key of the map is replaced by ``_pad_id``. ``_bos_id`` is placed
     in front of the sequence and ``_eos_id`` after it.
     """
     # Per-character lookup; unknown characters use pad_id; BOS/EOS wrap the sequence.
     ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
     return np.array(ids, dtype=np.int64)
 def synthesize(text: str):
     text = (text or "").strip()
     if not text:
     ids_len = np.array([ids.shape[1]], dtype=np.int64)
     scales = np.array(list(SCALES), dtype=np.float32)
     audio = _session.run(
         None,
         {"input": ids, "input_lengths": ids_len, "scales": scales},
     )[0].squeeze()
+    # 取樣率
+    sr = int(_config["audio"]["sample_rate"])
+    # 寫檔供下載（放工作目錄即可）
     out_name = f"truku_tts_{int(time.time()*1000)}.wav"
     out_path = os.path.abspath(out_name)
     sf.write(out_path, audio, samplerate=sr)
+    # 回傳 (sr, waveform) 供播放 + 檔案供下載
     return (sr, audio), out_path
+# ======== 極簡 UI ========
 demo = gr.Interface(
     fn=synthesize,
     inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
     ],
     title="太魯閣語語音合成 (女聲) — ONNX",
     description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
+    allow_flagging="never",
 )
 if __name__ == "__main__":
+    # On Hugging Face Spaces, do not specify host/port and do not use share=True.
+    demo.launch()