benjaminzhang committed on
Commit
593b69f
·
1 Parent(s): 4cd9425

Fix model/config path for Hugging Face

Browse files
Files changed (1) hide show
  1. app.py +32 -108
app.py CHANGED
@@ -1,11 +1,4 @@
1
- # truku_tts_gradio_app.py (極簡版)
2
- # ------------------------------------------------------------
3
- # 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
4
- # 需求:只輸入文字 → 直接合成(固定 model/config/scales)
5
- # - UI:只保留文字輸入、合成按鈕、音檔播放/下載
6
- # - 其餘選項全部移除
7
- # ------------------------------------------------------------
8
-
9
  import os
10
  import json
11
  import time
@@ -14,114 +7,46 @@ import soundfile as sf
14
  import onnxruntime as ort
15
  import gradio as gr
16
 
17
- # ======== 固定參數(依你的環境修改) ========
18
  MODEL_PATH = "cou_total.onnx"
19
  CONFIG_PATH = "cou_medium.onnx.json"
20
  SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
21
 
22
- # ======== 輔助:載入 config/phoneme_map、建立 ONNX session ========
23
- with open(CONFIG_PATH, "r", encoding="utf-8") as f:
24
- _config = json.load(f)
25
-
26
- _phoneme_map = _config["phoneme_id_map"]
27
- _pad_id = _phoneme_map["_"]
28
- _bos_id = _phoneme_map["^"]
29
- _eos_id = _phoneme_map["$"]
30
-
31
- # 若需要 GPU,可改 providers,例如:
32
- # providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
33
- _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
34
-
35
-
36
- def _text_to_ids(text: str) -> np.ndarray:
37
- # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
38
- ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
39
- return np.array(ids, dtype=np.int64)
40
-
41
-
42
- def synthesize(text: str):
43
- text = (text or "").strip()
44
- if not text:
45
- raise gr.Error("請輸入要合成的文字!")
46
-
47
- ids = _text_to_ids(text).reshape(1, -1)
48
- ids_len = np.array([ids.shape[1]], dtype=np.int64)
49
- scales = np.array(list(SCALES), dtype=np.float32)
50
 
51
- start = time.time()
52
- audio = _session.run(
53
- None,
54
- {"input": ids, "input_lengths": ids_len, "scales": scales},
55
- )[0].squeeze()
56
- rt = round(time.time() - start, 3)
57
 
58
- sr = int(_config["audio"]["sample_rate"]) # 取樣率
 
59
 
60
- # 同時寫檔,供下載
61
- out_name = f"truku_tts_{int(time.time()*1000)}.wav"
62
- out_path = os.path.abspath(out_name)
63
- sf.write(out_path, audio, samplerate=sr)
64
-
65
- # gr.Audio 可直接用 (sr, waveform)
66
- return (sr, audio), out_path
67
-
68
-
69
- # ======== 極簡 UI:只有一個輸入 + 合成 + 音檔 ========
70
- demo = gr.Interface(
71
- fn=synthesize,
72
- inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
73
- outputs=[
74
- gr.Audio(label="合成音檔", interactive=False, show_download_button=True),
75
- gr.File(label="下載 WAV 檔"),
76
- ],
77
- title="太魯閣語語音合成 (女聲) — ONNX",
78
- description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
79
- )
80
-
81
- if __name__ == "__main__":
82
- # 本機建議:若在 WSL,無法用 localhost 時,改用 share=True 或用 127.0.0.1:port
83
- demo.launch(server_name="127.0.0.1", server_port=7860)
84
- # truku_tts_gradio_app.py (極簡版)
85
- # ------------------------------------------------------------
86
- # 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
87
- # 需求:只輸入文字 → 直接合成(固定 model/config/scales)
88
- # - UI:只保留文字輸入、合成按鈕、音檔播放/下載
89
- # - 其餘選項全部移除
90
- # ------------------------------------------------------------
91
-
92
- import os
93
- import json
94
- import time
95
- import numpy as np
96
- import soundfile as sf
97
- import onnxruntime as ort
98
- import gradio as gr
99
-
100
- # ======== 固定參數(依你的環境修改) ========
101
- MODEL_PATH = "/home/benjamin/TTS/cou_total.onnx"
102
- CONFIG_PATH = "/home/benjamin/TTS/cou_medium.onnx.json"
103
- SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
104
 
105
- # ======== 輔助:載入 config/phoneme_map、建立 ONNX session ========
106
- with open(CONFIG_PATH, "r", encoding="utf-8") as f:
107
- _config = json.load(f)
108
 
109
- _phoneme_map = _config["phoneme_id_map"]
110
- _pad_id = _phoneme_map["_"]
111
- _bos_id = _phoneme_map["^"]
112
- _eos_id = _phoneme_map["$"]
113
 
114
- # 若需要 GPU,可改 providers,例如:
115
- # providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
116
  _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
117
 
118
-
119
  def _text_to_ids(text: str) -> np.ndarray:
120
  # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
121
  ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
122
  return np.array(ids, dtype=np.int64)
123
 
124
-
125
  def synthesize(text: str):
126
  text = (text or "").strip()
127
  if not text:
@@ -131,25 +56,23 @@ def synthesize(text: str):
131
  ids_len = np.array([ids.shape[1]], dtype=np.int64)
132
  scales = np.array(list(SCALES), dtype=np.float32)
133
 
134
- start = time.time()
135
  audio = _session.run(
136
  None,
137
  {"input": ids, "input_lengths": ids_len, "scales": scales},
138
  )[0].squeeze()
139
- rt = round(time.time() - start, 3)
140
 
141
- sr = int(_config["audio"]["sample_rate"]) # 取樣率
 
142
 
143
- # 同時寫檔,供下載
144
  out_name = f"truku_tts_{int(time.time()*1000)}.wav"
145
  out_path = os.path.abspath(out_name)
146
  sf.write(out_path, audio, samplerate=sr)
147
 
148
- # gr.Audio 可直接用 (sr, waveform)
149
  return (sr, audio), out_path
150
 
151
-
152
- # ======== 極簡 UI:只有一個輸入 + 合成 + 音檔 ========
153
  demo = gr.Interface(
154
  fn=synthesize,
155
  inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
@@ -159,8 +82,9 @@ demo = gr.Interface(
159
  ],
160
  title="太魯閣語語音合成 (女聲) — ONNX",
161
  description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
 
162
  )
163
 
164
  if __name__ == "__main__":
165
- # 本機建議:若在 WSL,無法用 localhost 時,改用 share=True 或用 127.0.0.1:port
166
- demo.launch(server_name="127.0.0.1", server_port=7860)
 
1
+ # 太魯閣語 (Truku) TTS — ONNX + Gradio(Hugging Face 版,公開可用)
 
 
 
 
 
 
 
2
  import os
3
  import json
4
  import time
 
7
  import onnxruntime as ort
8
  import gradio as gr
9
 
10
+ # ======== 固定參數(相對路徑,跟 app.py 放同一層) ========
11
  MODEL_PATH = "cou_total.onnx"
12
  CONFIG_PATH = "cou_medium.onnx.json"
13
  SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
14
 
15
+ # ======== 載入 config 與建立 ONNX session ========
16
+ def _load_config(cfg_path: str):
17
+ if not os.path.exists(cfg_path):
18
+ raise FileNotFoundError(f"找不到設定檔:{cfg_path}")
19
+ with open(cfg_path, "r", encoding="utf-8") as f:
20
+ return json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ try:
23
+ _config = _load_config(CONFIG_PATH)
24
+ except Exception as e:
25
+ # Spaces 在介面上清楚回報錯誤
26
+ raise RuntimeError(f"CONFIG 載入失敗:{e}")
 
27
 
28
+ if "phoneme_id_map" not in _config:
29
+ raise RuntimeError("CONFIG 缺少 'phoneme_id_map' 欄位")
30
 
31
+ _phoneme_map = _config["phoneme_id_map"]
32
+ _pad_id = _phoneme_map.get("_")
33
+ _bos_id = _phoneme_map.get("^")
34
+ _eos_id = _phoneme_map.get("$")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ if None in (_pad_id, _bos_id, _eos_id):
37
+ raise RuntimeError("CONFIG 的 phoneme_id_map 缺少 _、^、$ 其中之一")
 
38
 
39
+ if not os.path.exists(MODEL_PATH):
40
+ raise FileNotFoundError(f"找不到模型檔:{MODEL_PATH}")
 
 
41
 
42
+ # 若需 GPU,可改 providers=["CUDAExecutionProvider","CPUExecutionProvider"]
 
43
  _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
44
 
 
45
  def _text_to_ids(text: str) -> np.ndarray:
46
  # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
47
  ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
48
  return np.array(ids, dtype=np.int64)
49
 
 
50
  def synthesize(text: str):
51
  text = (text or "").strip()
52
  if not text:
 
56
  ids_len = np.array([ids.shape[1]], dtype=np.int64)
57
  scales = np.array(list(SCALES), dtype=np.float32)
58
 
 
59
  audio = _session.run(
60
  None,
61
  {"input": ids, "input_lengths": ids_len, "scales": scales},
62
  )[0].squeeze()
 
63
 
64
+ # 取樣率
65
+ sr = int(_config["audio"]["sample_rate"])
66
 
67
+ # 寫檔供下載(放工作目錄即可)
68
  out_name = f"truku_tts_{int(time.time()*1000)}.wav"
69
  out_path = os.path.abspath(out_name)
70
  sf.write(out_path, audio, samplerate=sr)
71
 
72
+ # 回傳 (sr, waveform) 供播放 + 檔案供下載
73
  return (sr, audio), out_path
74
 
75
+ # ======== 極簡 UI ========
 
76
  demo = gr.Interface(
77
  fn=synthesize,
78
  inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
 
82
  ],
83
  title="太魯閣語語音合成 (女聲) — ONNX",
84
  description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
85
+ allow_flagging="never",
86
  )
87
 
88
  if __name__ == "__main__":
89
+ # Hugging Face Spaces 不要指定 host/port,也不要 share=True
90
+ demo.launch()