Spaces:
Sleeping
Sleeping
Commit
·
fa4895b
1
Parent(s):
593b69f
Add app & mp3 samples via Git LFS
Browse files
- .gitattributes +1 -0
- app.py +55 -51
- samples/1.mp3 +3 -0
- samples/2.mp3 +3 -0
- samples/3.mp3 +3 -0
- samples/4.mp3 +3 -0
- samples/5.mp3 +3 -0
- samples/cou_b_01065_c19.mp3:Zone.Identifier +0 -0
- samples/cou_b_03730_c19.mp3:Zone.Identifier +0 -0
- samples/cou_b_06023_c19.mp3:Zone.Identifier +0 -0
- samples/cou_e_03884_c19.mp3:Zone.Identifier +0 -0
- samples/cou_e_04320_c19.mp3:Zone.Identifier +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
import time
|
|
@@ -7,43 +7,42 @@ import soundfile as sf
|
|
| 7 |
import onnxruntime as ort
|
| 8 |
import gradio as gr
|
| 9 |
|
| 10 |
-
# ======== 固定參數(相對路徑,跟 app.py 放同一層) ========
|
| 11 |
MODEL_PATH = "cou_total.onnx"
|
| 12 |
CONFIG_PATH = "cou_medium.onnx.json"
|
| 13 |
-
SCALES = (0.667, 1.0, 0.8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
# ======== 載入 config 與建立 ONNX session ========
|
| 16 |
def _load_config(cfg_path: str):
|
| 17 |
if not os.path.exists(cfg_path):
|
| 18 |
raise FileNotFoundError(f"找不到設定檔:{cfg_path}")
|
| 19 |
with open(cfg_path, "r", encoding="utf-8") as f:
|
| 20 |
return json.load(f)
|
| 21 |
|
| 22 |
-
|
| 23 |
-
_config = _load_config(CONFIG_PATH)
|
| 24 |
-
except Exception as e:
|
| 25 |
-
# 讓 Spaces 在介面上清楚回報錯誤
|
| 26 |
-
raise RuntimeError(f"CONFIG 載入失敗:{e}")
|
| 27 |
-
|
| 28 |
-
if "phoneme_id_map" not in _config:
|
| 29 |
-
raise RuntimeError("CONFIG 缺少 'phoneme_id_map' 欄位")
|
| 30 |
-
|
| 31 |
_phoneme_map = _config["phoneme_id_map"]
|
| 32 |
_pad_id = _phoneme_map.get("_")
|
| 33 |
_bos_id = _phoneme_map.get("^")
|
| 34 |
_eos_id = _phoneme_map.get("$")
|
| 35 |
|
| 36 |
-
if None in (_pad_id, _bos_id, _eos_id):
|
| 37 |
-
raise RuntimeError("CONFIG 的 phoneme_id_map 缺少 _、^、$ 其中之一")
|
| 38 |
-
|
| 39 |
-
if not os.path.exists(MODEL_PATH):
|
| 40 |
-
raise FileNotFoundError(f"找不到模型檔:{MODEL_PATH}")
|
| 41 |
-
|
| 42 |
-
# 若需 GPU,可改 providers=["CUDAExecutionProvider","CPUExecutionProvider"]
|
| 43 |
_session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
|
| 44 |
|
| 45 |
def _text_to_ids(text: str) -> np.ndarray:
|
| 46 |
-
# 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
|
| 47 |
ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
|
| 48 |
return np.array(ids, dtype=np.int64)
|
| 49 |
|
|
@@ -51,40 +50,45 @@ def synthesize(text: str):
|
|
| 51 |
text = (text or "").strip()
|
| 52 |
if not text:
|
| 53 |
raise gr.Error("請輸入要合成的文字!")
|
| 54 |
-
|
| 55 |
ids = _text_to_ids(text).reshape(1, -1)
|
| 56 |
ids_len = np.array([ids.shape[1]], dtype=np.int64)
|
| 57 |
scales = np.array(list(SCALES), dtype=np.float32)
|
| 58 |
-
|
| 59 |
-
audio = _session.run(
|
| 60 |
-
None,
|
| 61 |
-
{"input": ids, "input_lengths": ids_len, "scales": scales},
|
| 62 |
-
)[0].squeeze()
|
| 63 |
-
|
| 64 |
-
# 取樣率
|
| 65 |
sr = int(_config["audio"]["sample_rate"])
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
return
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
if __name__ == "__main__":
|
| 89 |
-
# 在 Hugging Face Spaces 不要指定 host/port,也不要 share=True
|
| 90 |
demo.launch()
|
|
|
|
| 1 |
+
# 鄒語 (Tsou) TTS — ONNX + Gradio(Hugging Face 版)
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
import time
|
|
|
|
| 7 |
import onnxruntime as ort
|
| 8 |
import gradio as gr
|
| 9 |
|
|
|
|
| 10 |
MODEL_PATH = "cou_total.onnx"
|
| 11 |
CONFIG_PATH = "cou_medium.onnx.json"
|
| 12 |
+
SCALES = (0.667, 1.0, 0.8)
|
| 13 |
+
|
| 14 |
+
SAMPLES_DIR = "samples"
|
| 15 |
+
os.makedirs(SAMPLES_DIR, exist_ok=True)
|
| 16 |
+
|
| 17 |
+
# 改為 mp3 檔案
|
| 18 |
+
NUM_EXAMPLES = [
|
| 19 |
+
{"zh": "一", "tsou": "coni", "mp3": os.path.join(SAMPLES_DIR, "1.mp3")},
|
| 20 |
+
{"zh": "二", "tsou": "yuso", "mp3": os.path.join(SAMPLES_DIR, "2.mp3")},
|
| 21 |
+
{"zh": "三", "tsou": "tuyu", "mp3": os.path.join(SAMPLES_DIR, "3.mp3")},
|
| 22 |
+
{"zh": "四", "tsou": "sʉptʉ", "mp3": os.path.join(SAMPLES_DIR, "4.mp3")},
|
| 23 |
+
{"zh": "五", "tsou": "eimo", "mp3": os.path.join(SAMPLES_DIR, "5.mp3")},
|
| 24 |
+
{"zh": "六", "tsou": "nomʉ", "mp3": os.path.join(SAMPLES_DIR, "6.mp3")},
|
| 25 |
+
{"zh": "七", "tsou": "pitu", "mp3": os.path.join(SAMPLES_DIR, "7.mp3")},
|
| 26 |
+
{"zh": "八", "tsou": "voyu", "mp3": os.path.join(SAMPLES_DIR, "8.mp3")},
|
| 27 |
+
{"zh": "九", "tsou": "siyo", "mp3": os.path.join(SAMPLES_DIR, "9.mp3")},
|
| 28 |
+
{"zh": "十", "tsou": "maskʉ", "mp3": os.path.join(SAMPLES_DIR, "10.mp3")},
|
| 29 |
+
]
|
| 30 |
|
|
|
|
| 31 |
def _load_config(cfg_path: str):
|
| 32 |
if not os.path.exists(cfg_path):
|
| 33 |
raise FileNotFoundError(f"找不到設定檔:{cfg_path}")
|
| 34 |
with open(cfg_path, "r", encoding="utf-8") as f:
|
| 35 |
return json.load(f)
|
| 36 |
|
| 37 |
+
_config = _load_config(CONFIG_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
_phoneme_map = _config["phoneme_id_map"]
|
| 39 |
_pad_id = _phoneme_map.get("_")
|
| 40 |
_bos_id = _phoneme_map.get("^")
|
| 41 |
_eos_id = _phoneme_map.get("$")
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
_session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
|
| 44 |
|
| 45 |
def _text_to_ids(text: str) -> np.ndarray:
|
|
|
|
| 46 |
ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
|
| 47 |
return np.array(ids, dtype=np.int64)
|
| 48 |
|
|
|
|
| 50 |
text = (text or "").strip()
|
| 51 |
if not text:
|
| 52 |
raise gr.Error("請輸入要合成的文字!")
|
|
|
|
| 53 |
ids = _text_to_ids(text).reshape(1, -1)
|
| 54 |
ids_len = np.array([ids.shape[1]], dtype=np.int64)
|
| 55 |
scales = np.array(list(SCALES), dtype=np.float32)
|
| 56 |
+
audio = _session.run(None, {"input": ids, "input_lengths": ids_len, "scales": scales})[0].squeeze()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
sr = int(_config["audio"]["sample_rate"])
|
| 58 |
+
out_name = f"tsou_tts_{int(time.time()*1000)}.wav"
|
| 59 |
+
sf.write(out_name, audio, samplerate=sr)
|
| 60 |
+
return (sr, audio), out_name
|
| 61 |
+
|
| 62 |
+
def _pick_example(sample):
|
| 63 |
+
if isinstance(sample, list) and sample:
|
| 64 |
+
return sample[1]
|
| 65 |
+
return ""
|
| 66 |
+
|
| 67 |
+
with gr.Blocks(title="鄒語語音合成 (女聲) — ONNX") as demo:
|
| 68 |
+
gr.Markdown("""
|
| 69 |
+
# 鄒語語音合成 (女聲) — ONNX
|
| 70 |
+
- 輸入鄒語文字,按「合成」取得音檔
|
| 71 |
+
- 下方提供 **中文/鄒語/音檔** 範例(數字 1–10,mp3 格式)。點範例可自動帶入上方輸入框。
|
| 72 |
+
""")
|
| 73 |
+
text_in = gr.Textbox(lines=3, placeholder="請輸入鄒語文字…", label="輸入文字")
|
| 74 |
+
run_btn = gr.Button("🚀 合成", variant="primary")
|
| 75 |
+
audio_out = gr.Audio(label="合成音檔", interactive=False, show_download_button=True)
|
| 76 |
+
file_out = gr.File(label="下載 WAV 檔")
|
| 77 |
+
run_btn.click(synthesize, inputs=[text_in], outputs=[audio_out, file_out])
|
| 78 |
+
gr.Markdown("""
|
| 79 |
+
### 範例(中文/鄒語/音檔:1~10,mp3 格式)
|
| 80 |
+
請將 `1.mp3` ~ `10.mp3` 放到 `samples/` 資料夾。
|
| 81 |
+
""")
|
| 82 |
+
dataset = gr.Dataset(
|
| 83 |
+
components=[
|
| 84 |
+
gr.Textbox(label="中文", interactive=False),
|
| 85 |
+
gr.Textbox(label="鄒語", interactive=False),
|
| 86 |
+
gr.Audio(label="音檔", interactive=False),
|
| 87 |
+
],
|
| 88 |
+
samples=[[e["zh"], e["tsou"], e["mp3"] if os.path.exists(e["mp3"]) else None] for e in NUM_EXAMPLES],
|
| 89 |
+
label="中文/鄒語/音檔(1~10,mp3 格式)",
|
| 90 |
+
)
|
| 91 |
+
dataset.click(_pick_example, inputs=dataset, outputs=text_in)
|
| 92 |
|
| 93 |
if __name__ == "__main__":
|
|
|
|
| 94 |
demo.launch()
|
samples/1.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3769ff49ada3e47f632c80413eba5da2636dd01a309cd5d7a394cf78c4807c1
|
| 3 |
+
size 54629
|
samples/2.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5cf809334b0f967f8c5299579bd24f2794c2d2438602b97c63724310aa990d77
|
| 3 |
+
size 150752
|
samples/3.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ea60eccf5912a305bc060dd928df57bfbbf4e39f0c0de27b045c901cee0e07b
|
| 3 |
+
size 59442
|
samples/4.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc2462c07d68b28a49eacda6cd1887632987a73c247d6517c42b883b90f63af8
|
| 3 |
+
size 432698
|
samples/5.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ebe4cbc9d9605b41a6a7e084b518c2225152e887c4f6b2df6193729ad6b51d2
|
| 3 |
+
size 459894
|
samples/cou_b_01065_c19.mp3:Zone.Identifier
ADDED
|
File without changes
|
samples/cou_b_03730_c19.mp3:Zone.Identifier
ADDED
|
File without changes
|
samples/cou_b_06023_c19.mp3:Zone.Identifier
ADDED
|
File without changes
|
samples/cou_e_03884_c19.mp3:Zone.Identifier
ADDED
|
File without changes
|
samples/cou_e_04320_c19.mp3:Zone.Identifier
ADDED
|
File without changes
|