Update app.py
app.py
CHANGED
@@ -187,6 +187,126 @@ def create_tts_fn(hps, net_g, device):
         return "Success", (hps.data.sampling_rate, audio)
     return tts_fn
 
+
+# Function to build a single tab per model
+def create_tab(title, example, speakers, tts_fn, split_fn, repid):
+    with gr.TabItem(speakers[0]):
+        gr.Markdown(
+            '<div align="center">'
+            f'<a><strong>{repid}</strong></a>'
+            '<br>'
+            f'<a><strong>{title}</strong></a>'
+            '<br>'
+            f'<a><strong>{speakers}</strong></a>'
+            '</div>'
+        )
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(label="Input text", lines=5, value=example)
+                speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="Speaker")
+                prompt_mode = gr.Radio(["Text prompt", "Audio prompt"], label="Prompt Mode", value="Text prompt")
+                text_prompt = gr.Textbox(label="Text prompt", value="Happy", visible=True)
+                audio_prompt = gr.Audio(label="Audio prompt", type="filepath", visible=False)
+                sdp_ratio = gr.Slider(0, 1, 0.2, 0.1, label="SDP Ratio")
+                noise_scale = gr.Slider(0.1, 2.0, 0.6, 0.1, label="Noise")
+                noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
+                length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
+                language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
+                btn = gr.Button("Generate Audio", variant="primary")
+
+            with gr.Column():
+                with gr.Accordion("Semantic Fusion", open=False):
+                    gr.Markdown(
+                        value="Use auxiliary text semantics to assist speech generation (the language remains the same as the main text).\n\n"
+                        "**Note**: Avoid *command-style text* (e.g., 'Happy'); use *emotionally rich text* (e.g., 'I'm so happy!!!').\n\n"
+                        "Leave it blank to disable.\n\n"
+                        "**If mispronunciations occur, try replacing characters and entering the original text here with the weight set to 1.0 for semantic retention.**"
+                    )
+                    style_text = gr.Textbox(label="Auxiliary Text")
+                    style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Weight", info="Ratio between main and auxiliary BERT embeddings")
+
+        with gr.Row():
+            with gr.Column():
+                interval_between_sent = gr.Slider(0, 5, 0.2, 0.1, label="Pause between sentences (sec)")
+                interval_between_para = gr.Slider(0, 10, 1, 0.1, label="Pause between paragraphs (sec)")
+                opt_cut_by_sent = gr.Checkbox(label="Split by sentence")
+                slicer = gr.Button("Split and Generate", variant="primary")
+
+            with gr.Column():
+                output_msg = gr.Textbox(label="Output Message")
+                output_audio = gr.Audio(label="Output Audio")
+
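+        # Wire UI events; gr_util and load_audio are helpers defined elsewhere in app.py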
+        prompt_mode.change(lambda x: gr_util(x), inputs=[prompt_mode], outputs=[text_prompt, audio_prompt])
+        audio_prompt.upload(lambda x: load_audio(x), inputs=[audio_prompt], outputs=[audio_prompt])
+        btn.click(
+            tts_fn,
+            inputs=[
+                input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language,
+                audio_prompt, text_prompt, prompt_mode, style_text, style_weight
+            ],
+            outputs=[output_msg, output_audio],
+        )
+        slicer.click(
+            split_fn,
+            inputs=[
+                input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language,
+                opt_cut_by_sent, interval_between_para, interval_between_sent,
+                audio_prompt, text_prompt, style_text, style_weight
+            ],
+            outputs=[output_msg, output_audio],
+        )
+
+# --- Main entry point ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", default=False, action="store_true", help="make the link public")
+    parser.add_argument("-d", "--debug", action="store_true", help="enable DEBUG-level logging")
+    args = parser.parse_args()
+
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+
+    with open("pretrained_models/info.json", "r", encoding="utf-8") as f:
+        models_info = json.load(f)
+
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    models = []
+    for _, info in models_info.items():
+        if not info['enable']:
+            continue
+        name, title, repid, example, filename = info['name'], info['title'], info['repid'], info['example'], info['filename']
+
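+        # The checkpoint may live at the repo root or in a subfolder; scan the repo file list to find it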
+        files = list_repo_files(repo_id=repid)
+        model_subfolder = None
+        for f in files:
+            if f.endswith(filename):
+                parts = f.split("/")
+                if len(parts) > 1:
+                    model_subfolder = "/".join(parts[:-1])
+                break
+
+        if model_subfolder:
+            model_path = hf_hub_download(repo_id=repid, filename=filename, subfolder=model_subfolder)
+            config_path = hf_hub_download(repo_id=repid, filename="config.json", subfolder=model_subfolder)
+        else:
+            model_path = hf_hub_download(repo_id=repid, filename=filename)
+            config_path = hf_hub_download(repo_id=repid, filename="config.json")
+
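+        # Load hyperparameters, build the synthesizer, and create per-model inference closures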
+        hps = utils.get_hparams_from_file(config_path)
+        version = hps.version if hasattr(hps, "version") else "v2"
+        net_g = get_net_g(model_path, version, device, hps)
+        tts_fn = create_tts_fn(hps, net_g, device)
+        split_fn = create_split_fn(hps, net_g, device)
+        models.append((title, example, list(hps.data.spk2id.keys()), tts_fn, split_fn, repid))
+
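+    # Assemble the UI: one tab per loaded model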
+    with gr.Blocks(theme='NoCrypt/miku') as app:
+        gr.Markdown("## ✅ All models loaded successfully. Ready to use.")
+        with gr.Tabs():
+            for (title, example, speakers, tts_fn, split_fn, repid) in models:
+                create_tab(title, example, speakers, tts_fn, split_fn, repid)
+
+    app.queue().launch(share=args.share)
+
 # Then patch create_tab to accept split_fn and use it in slicer.click
 # And in the model loop, generate both tts_fn and split_fn, then pass both into create_tab
 # (Same as your current setup, but now split_fn is isolated per model, just like tts_fn)
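
Note: the hunk calls several helpers that are defined elsewhere in app.py and do not appear in this diff: gr_util, load_audio, get_net_g, create_tts_fn, and create_split_fn. For orientation only, the sketch below shows plausible shapes for gr_util and load_audio inferred from their call sites above: gr_util must return visibility updates for [text_prompt, audio_prompt], and load_audio must map an uploaded file back onto the audio component. The bodies are assumptions, not this Space's actual code.

# Hypothetical sketch, inferred from the call sites above -- not this Space's code.
import gradio as gr
import librosa

def gr_util(mode):
    # prompt_mode.change(...) expects one update per output in [text_prompt, audio_prompt]:
    # show the textbox in "Text prompt" mode, the audio widget otherwise.
    if mode == "Text prompt":
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)

def load_audio(path):
    # audio_prompt.upload(...) feeds the component back to itself, so a plausible
    # job is resampling the uploaded reference clip (target rate assumed here).
    audio, sr = librosa.load(path, sr=48000)
    return sr, audio

create_split_fn presumably mirrors create_tts_fn: it closes over (hps, net_g, device), cuts the input into paragraphs (and optionally sentences when "Split by sentence" is checked), synthesizes each piece, and joins the audio with the configured pauses, which is why each model gets its own isolated split_fn in the loop above.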