JotunnBurton committed · verified
Commit bfb8cef · 1 Parent(s): b537b52

Update app.py

Files changed (1):
  1. app.py +120 -0
app.py CHANGED
@@ -187,6 +187,126 @@ def create_tts_fn(hps, net_g, device):
         return "Success", (hps.data.sampling_rate, audio)
     return tts_fn
 
+
+# Function to build a single tab per model
+def create_tab(title, example, speakers, tts_fn, split_fn, repid):
+    with gr.TabItem(speakers[0]):
+        gr.Markdown(
+            '<div align="center">'
+            f'<a><strong>{repid}</strong></a>'
+            '<br>'
+            f'<a><strong>{title}</strong></a>'
+            '<br>'
+            f'<a><strong>{speakers}</strong></a>'
+            '</div>'
+        )
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(label="Input text", lines=5, value=example)
+                speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="Speaker")
+                prompt_mode = gr.Radio(["Text prompt", "Audio prompt"], label="Prompt Mode", value="Text prompt")
+                text_prompt = gr.Textbox(label="Text prompt", value="Happy", visible=True)
+                audio_prompt = gr.Audio(label="Audio prompt", type="filepath", visible=False)
+                sdp_ratio = gr.Slider(0, 1, 0.2, 0.1, label="SDP Ratio")
+                noise_scale = gr.Slider(0.1, 2.0, 0.6, 0.1, label="Noise")
+                noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
+                length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
+                language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
+                btn = gr.Button("Generate Audio", variant="primary")
+
+            with gr.Column():
+                with gr.Accordion("Semantic Fusion", open=False):
+                    gr.Markdown(
+                        value="Use auxiliary text semantics to assist speech generation (the language stays the same as the main text).\n\n"
+                        "**Note**: avoid *command-style text* (e.g. 'Happy'); use *emotionally rich text* (e.g. 'I'm so happy!!!').\n\n"
+                        "Leave it blank to disable.\n\n"
+                        "**If mispronunciations occur, try replacing characters in the main text and entering the original here with the weight set to 1.0 to retain its semantics.**"
+                    )
+                    style_text = gr.Textbox(label="Auxiliary Text")
+                    style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Weight", info="Ratio between the main and auxiliary BERT embeddings")
+
+                with gr.Row():
+                    with gr.Column():
+                        interval_between_sent = gr.Slider(0, 5, 0.2, 0.1, label="Pause between sentences (sec)")
+                        interval_between_para = gr.Slider(0, 10, 1, 0.1, label="Pause between paragraphs (sec)")
+                        opt_cut_by_sent = gr.Checkbox(label="Split by sentence")
+                        slicer = gr.Button("Split and Generate", variant="primary")
+
+                    with gr.Column():
+                        output_msg = gr.Textbox(label="Output Message")
+                        output_audio = gr.Audio(label="Output Audio")
+
+        prompt_mode.change(lambda x: gr_util(x), inputs=[prompt_mode], outputs=[text_prompt, audio_prompt])
+        audio_prompt.upload(lambda x: load_audio(x), inputs=[audio_prompt], outputs=[audio_prompt])
+        btn.click(
+            tts_fn,
+            inputs=[
+                input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language,
+                audio_prompt, text_prompt, prompt_mode, style_text, style_weight
+            ],
+            outputs=[output_msg, output_audio],
+        )
+        slicer.click(
+            split_fn,
+            inputs=[
+                input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language,
+                opt_cut_by_sent, interval_between_para, interval_between_sent,
+                audio_prompt, text_prompt, style_text, style_weight
+            ],
+            outputs=[output_msg, output_audio],
+        )
+
+# --- Main entry point ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", default=False, help="make link public", action="store_true")
+    parser.add_argument("-d", "--debug", action="store_true", help="enable DEBUG-level logging")
+    args = parser.parse_args()
+
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+
+    with open("pretrained_models/info.json", "r", encoding="utf-8") as f:
+        models_info = json.load(f)
+
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    models = []
+    for _, info in models_info.items():
+        if not info['enable']:
+            continue
+        name, title, repid, example, filename = info['name'], info['title'], info['repid'], info['example'], info['filename']
+
+        # Locate the checkpoint in the repo, including any subfolder it sits in
+        files = list_repo_files(repo_id=repid)
+        model_subfolder = None
+        for path in files:
+            if path.endswith(filename):
+                parts = path.split("/")
+                if len(parts) > 1:
+                    model_subfolder = "/".join(parts[:-1])
+                break
+
+        if model_subfolder:
+            model_path = hf_hub_download(repo_id=repid, filename=filename, subfolder=model_subfolder)
+            config_path = hf_hub_download(repo_id=repid, filename="config.json", subfolder=model_subfolder)
+        else:
+            model_path = hf_hub_download(repo_id=repid, filename=filename)
+            config_path = hf_hub_download(repo_id=repid, filename="config.json")
+
+        hps = utils.get_hparams_from_file(config_path)
+        version = hps.version if hasattr(hps, "version") else "v2"
+        net_g = get_net_g(model_path, version, device, hps)
+        tts_fn = create_tts_fn(hps, net_g, device)
+        split_fn = create_split_fn(hps, net_g, device)
+        models.append((title, example, list(hps.data.spk2id.keys()), tts_fn, split_fn, repid))
+
+    with gr.Blocks(theme='NoCrypt/miku') as app:
+        gr.Markdown("## ✅ All models loaded successfully. Ready to use.")
+        with gr.Tabs():
+            for (title, example, speakers, tts_fn, split_fn, repid) in models:
+                create_tab(title, example, speakers, tts_fn, split_fn, repid)
+
+    app.queue().launch(share=args.share)
+
 # Then patch create_tab to accept split_fn and use it in slicer.click
 # And in the model loop, generate both tts_fn and split_fn then pass both into create_tab
 # (Same as your current setup but now split_fn is isolated per model just like tts_fn)
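
The context comments above describe the factory pattern this commit relies on: each model gets its own `split_fn` closure, just as `create_tts_fn` produces a per-model `tts_fn`. `create_split_fn` itself lies outside this hunk; the sketch below only illustrates that closure shape, assuming it splits the text and joins per-chunk audio with silence gaps. `synthesize_one` is a made-up stand-in, not a function from this repo.

```python
import re
import numpy as np

def create_split_fn(hps, net_g, device):
    # The closure captures this model's hps/net_g/device, so every tab
    # gets its own isolated split_fn (same pattern as create_tts_fn).
    def split_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w,
                 length_scale, language, cut_by_sent,
                 interval_between_para, interval_between_sent,
                 audio_prompt, text_prompt, style_text, style_weight):
        sr = hps.data.sampling_rate

        def synthesize_one(chunk):
            # Stand-in for the net_g inference that tts_fn performs;
            # it emits a short silence just so the sketch runs.
            return np.zeros(int(sr * 0.1), dtype=np.float32)

        pieces = []
        for para in (p for p in text.split("\n") if p.strip()):
            chunks = re.split(r"(?<=[。！？.!?])", para) if cut_by_sent else [para]
            for chunk in (c for c in chunks if c.strip()):
                pieces.append(synthesize_one(chunk))
                pieces.append(np.zeros(int(sr * interval_between_sent), dtype=np.float32))
            pieces.append(np.zeros(int(sr * interval_between_para), dtype=np.float32))
        return "Success", (sr, np.concatenate(pieces))

    return split_fn
```

Note that the parameter order mirrors the `inputs=` list wired to `slicer.click` above, which is what keeps the Gradio callback and the closure in sync.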
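The loop over `models_info.items()` also implies a schema for `pretrained_models/info.json`. Shown here as the Python object `json.load` would return; the key names come from the code, but every value is an invented placeholder:

```python
# Hypothetical shape of pretrained_models/info.json after json.load();
# only the key names are taken from the loop above.
models_info = {
    "model_1": {
        "enable": True,               # entries with False are skipped
        "name": "model_1",
        "title": "Example voice model",
        "repid": "user/some-repo",    # Hugging Face repo id (placeholder)
        "example": "こんにちは",        # default text for the input box
        "filename": "G_latest.pth",   # checkpoint to locate in the repo
    },
}
```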
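`gr_util` and `load_audio` are likewise defined elsewhere in app.py. From the `.change()` wiring, `gr_util` must return one update per output component; a minimal sketch of the assumed behavior, not the repo's actual implementation:

```python
import gradio as gr

def gr_util(mode):
    # Toggle visibility between the two prompt widgets, matching the
    # outputs=[text_prompt, audio_prompt] wiring in create_tab.
    if mode == "Text prompt":
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)
```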