Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
app.py
CHANGED
|
@@ -120,6 +120,14 @@ def infer(
|
|
| 120 |
speed=1,
|
| 121 |
show_info=gr.Info,
|
| 122 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
| 124 |
|
| 125 |
if model == "F5-TTS":
|
|
@@ -240,7 +248,7 @@ with gr.Blocks() as app_tts:
|
|
| 240 |
nfe_step=nfe_slider,
|
| 241 |
speed=speed_slider,
|
| 242 |
)
|
| 243 |
-
return audio_out, spectrogram_path,
|
| 244 |
|
| 245 |
generate_btn.click(
|
| 246 |
basic_tts,
|
|
@@ -320,7 +328,7 @@ with gr.Blocks() as app_multistyle:
|
|
| 320 |
)
|
| 321 |
|
| 322 |
# Regular speech type (mandatory)
|
| 323 |
-
with gr.Row():
|
| 324 |
with gr.Column():
|
| 325 |
regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
|
| 326 |
regular_insert = gr.Button("Insert Label", variant="secondary")
|
|
@@ -329,12 +337,12 @@ with gr.Blocks() as app_multistyle:
|
|
| 329 |
|
| 330 |
# Regular speech type (max 100)
|
| 331 |
max_speech_types = 100
|
| 332 |
-
speech_type_rows = []
|
| 333 |
-
speech_type_names = [regular_name]
|
| 334 |
-
speech_type_audios = [regular_audio]
|
| 335 |
-
speech_type_ref_texts = [regular_ref_text]
|
| 336 |
-
speech_type_delete_btns = []
|
| 337 |
-
speech_type_insert_btns = [regular_insert]
|
| 338 |
|
| 339 |
# Additional speech types (99 more)
|
| 340 |
for i in range(max_speech_types - 1):
|
|
@@ -355,51 +363,32 @@ with gr.Blocks() as app_multistyle:
|
|
| 355 |
# Button to add speech type
|
| 356 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
| 357 |
|
| 358 |
-
# Keep track of
|
| 359 |
-
speech_type_count =
|
| 360 |
|
| 361 |
# Function to add a speech type
|
| 362 |
-
def add_speech_type_fn(
|
|
|
|
|
|
|
| 363 |
if speech_type_count < max_speech_types:
|
|
|
|
| 364 |
speech_type_count += 1
|
| 365 |
-
# Prepare updates for the rows
|
| 366 |
-
row_updates = []
|
| 367 |
-
for i in range(1, max_speech_types):
|
| 368 |
-
if i < speech_type_count:
|
| 369 |
-
row_updates.append(gr.update(visible=True))
|
| 370 |
-
else:
|
| 371 |
-
row_updates.append(gr.update())
|
| 372 |
else:
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
return [speech_type_count] + row_updates
|
| 376 |
|
| 377 |
-
add_speech_type_btn.click(
|
| 378 |
-
add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
|
| 379 |
-
)
|
| 380 |
|
| 381 |
# Function to delete a speech type
|
| 382 |
-
def
|
| 383 |
-
|
| 384 |
-
# Prepare updates
|
| 385 |
-
row_updates = []
|
| 386 |
-
|
| 387 |
-
for i in range(1, max_speech_types):
|
| 388 |
-
if i == index:
|
| 389 |
-
row_updates.append(gr.update(visible=False))
|
| 390 |
-
else:
|
| 391 |
-
row_updates.append(gr.update())
|
| 392 |
-
|
| 393 |
-
speech_type_count = max(1, speech_type_count)
|
| 394 |
-
|
| 395 |
-
return [speech_type_count] + row_updates
|
| 396 |
-
|
| 397 |
-
return delete_speech_type_fn
|
| 398 |
|
| 399 |
# Update delete button clicks
|
| 400 |
-
for i
|
| 401 |
-
|
| 402 |
-
|
|
|
|
|
|
|
| 403 |
|
| 404 |
# Text input for the prompt
|
| 405 |
gen_text_input_multistyle = gr.Textbox(
|
|
@@ -413,7 +402,7 @@ with gr.Blocks() as app_multistyle:
|
|
| 413 |
current_text = current_text or ""
|
| 414 |
speech_type_name = speech_type_name or "None"
|
| 415 |
updated_text = current_text + f"{{{speech_type_name}}} "
|
| 416 |
-
return
|
| 417 |
|
| 418 |
return insert_speech_type_fn
|
| 419 |
|
|
@@ -473,10 +462,14 @@ with gr.Blocks() as app_multistyle:
|
|
| 473 |
if style in speech_types:
|
| 474 |
current_style = style
|
| 475 |
else:
|
| 476 |
-
|
| 477 |
current_style = "Regular"
|
| 478 |
|
| 479 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
ref_text = speech_types[current_style].get("ref_text", "")
|
| 481 |
|
| 482 |
# Generate speech for this segment
|
|
@@ -491,12 +484,10 @@ with gr.Blocks() as app_multistyle:
|
|
| 491 |
# Concatenate all audio segments
|
| 492 |
if generated_audio_segments:
|
| 493 |
final_audio_data = np.concatenate(generated_audio_segments)
|
| 494 |
-
return [(sr, final_audio_data)] + [
|
| 495 |
-
gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
|
| 496 |
-
]
|
| 497 |
else:
|
| 498 |
gr.Warning("No audio generated.")
|
| 499 |
-
return [None] + [
|
| 500 |
|
| 501 |
generate_multistyle_btn.click(
|
| 502 |
generate_multistyle_speech,
|
|
@@ -514,7 +505,7 @@ with gr.Blocks() as app_multistyle:
|
|
| 514 |
|
| 515 |
# Validation function to disable Generate button if speech types are missing
|
| 516 |
def validate_speech_types(gen_text, regular_name, *args):
|
| 517 |
-
speech_type_names_list = args
|
| 518 |
|
| 519 |
# Collect the speech types names
|
| 520 |
speech_types_available = set()
|
|
@@ -678,7 +669,7 @@ Have a conversation with an AI using your reference voice!
|
|
| 678 |
speed=1.0,
|
| 679 |
show_info=print, # show_info=print no pull to top when generating
|
| 680 |
)
|
| 681 |
-
return audio_result,
|
| 682 |
|
| 683 |
def clear_conversation():
|
| 684 |
"""Reset the conversation"""
|
|
@@ -828,7 +819,10 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
|
|
| 828 |
visible=False,
|
| 829 |
)
|
| 830 |
custom_model_cfg = gr.Dropdown(
|
| 831 |
-
choices=[
|
|
|
|
|
|
|
|
|
|
| 832 |
value=load_last_used_custom()[2],
|
| 833 |
allow_custom_value=True,
|
| 834 |
label="Config: in a dictionary form",
|
|
|
|
| 120 |
speed=1,
|
| 121 |
show_info=gr.Info,
|
| 122 |
):
|
| 123 |
+
if not ref_audio_orig:
|
| 124 |
+
gr.Warning("Please provide reference audio.")
|
| 125 |
+
return gr.update(), gr.update(), ref_text
|
| 126 |
+
|
| 127 |
+
if not gen_text.strip():
|
| 128 |
+
gr.Warning("Please enter text to generate.")
|
| 129 |
+
return gr.update(), gr.update(), ref_text
|
| 130 |
+
|
| 131 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
| 132 |
|
| 133 |
if model == "F5-TTS":
|
|
|
|
| 248 |
nfe_step=nfe_slider,
|
| 249 |
speed=speed_slider,
|
| 250 |
)
|
| 251 |
+
return audio_out, spectrogram_path, ref_text_out
|
| 252 |
|
| 253 |
generate_btn.click(
|
| 254 |
basic_tts,
|
|
|
|
| 328 |
)
|
| 329 |
|
| 330 |
# Regular speech type (mandatory)
|
| 331 |
+
with gr.Row() as regular_row:
|
| 332 |
with gr.Column():
|
| 333 |
regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
|
| 334 |
regular_insert = gr.Button("Insert Label", variant="secondary")
|
|
|
|
| 337 |
|
| 338 |
# Regular speech type (max 100)
|
| 339 |
max_speech_types = 100
|
| 340 |
+
speech_type_rows = [regular_row]
|
| 341 |
+
speech_type_names = [regular_name]
|
| 342 |
+
speech_type_audios = [regular_audio]
|
| 343 |
+
speech_type_ref_texts = [regular_ref_text]
|
| 344 |
+
speech_type_delete_btns = [None]
|
| 345 |
+
speech_type_insert_btns = [regular_insert]
|
| 346 |
|
| 347 |
# Additional speech types (99 more)
|
| 348 |
for i in range(max_speech_types - 1):
|
|
|
|
| 363 |
# Button to add speech type
|
| 364 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
| 365 |
|
| 366 |
+
# Keep track of autoincrement of speech types, no roll back
|
| 367 |
+
speech_type_count = 1
|
| 368 |
|
| 369 |
# Function to add a speech type
|
| 370 |
+
def add_speech_type_fn():
|
| 371 |
+
row_updates = [gr.update() for _ in range(max_speech_types)]
|
| 372 |
+
global speech_type_count
|
| 373 |
if speech_type_count < max_speech_types:
|
| 374 |
+
row_updates[speech_type_count] = gr.update(visible=True)
|
| 375 |
speech_type_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
else:
|
| 377 |
+
gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
|
| 378 |
+
return row_updates
|
|
|
|
| 379 |
|
| 380 |
+
add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
|
|
|
|
|
|
|
| 381 |
|
| 382 |
# Function to delete a speech type
|
| 383 |
+
def delete_speech_type_fn():
|
| 384 |
+
return gr.update(visible=False), None, None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
|
| 386 |
# Update delete button clicks
|
| 387 |
+
for i in range(1, len(speech_type_delete_btns)):
|
| 388 |
+
speech_type_delete_btns[i].click(
|
| 389 |
+
delete_speech_type_fn,
|
| 390 |
+
outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
|
| 391 |
+
)
|
| 392 |
|
| 393 |
# Text input for the prompt
|
| 394 |
gen_text_input_multistyle = gr.Textbox(
|
|
|
|
| 402 |
current_text = current_text or ""
|
| 403 |
speech_type_name = speech_type_name or "None"
|
| 404 |
updated_text = current_text + f"{{{speech_type_name}}} "
|
| 405 |
+
return updated_text
|
| 406 |
|
| 407 |
return insert_speech_type_fn
|
| 408 |
|
|
|
|
| 462 |
if style in speech_types:
|
| 463 |
current_style = style
|
| 464 |
else:
|
| 465 |
+
gr.Warning(f"Type {style} is not available, will use Regular as default.")
|
| 466 |
current_style = "Regular"
|
| 467 |
|
| 468 |
+
try:
|
| 469 |
+
ref_audio = speech_types[current_style]["audio"]
|
| 470 |
+
except KeyError:
|
| 471 |
+
gr.Warning(f"Please provide reference audio for type {current_style}.")
|
| 472 |
+
return [None] + [speech_types[style]["ref_text"] for style in speech_types]
|
| 473 |
ref_text = speech_types[current_style].get("ref_text", "")
|
| 474 |
|
| 475 |
# Generate speech for this segment
|
|
|
|
| 484 |
# Concatenate all audio segments
|
| 485 |
if generated_audio_segments:
|
| 486 |
final_audio_data = np.concatenate(generated_audio_segments)
|
| 487 |
+
return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
|
|
|
|
|
|
|
| 488 |
else:
|
| 489 |
gr.Warning("No audio generated.")
|
| 490 |
+
return [None] + [speech_types[style]["ref_text"] for style in speech_types]
|
| 491 |
|
| 492 |
generate_multistyle_btn.click(
|
| 493 |
generate_multistyle_speech,
|
|
|
|
| 505 |
|
| 506 |
# Validation function to disable Generate button if speech types are missing
|
| 507 |
def validate_speech_types(gen_text, regular_name, *args):
|
| 508 |
+
speech_type_names_list = args
|
| 509 |
|
| 510 |
# Collect the speech types names
|
| 511 |
speech_types_available = set()
|
|
|
|
| 669 |
speed=1.0,
|
| 670 |
show_info=print, # show_info=print no pull to top when generating
|
| 671 |
)
|
| 672 |
+
return audio_result, ref_text_out
|
| 673 |
|
| 674 |
def clear_conversation():
|
| 675 |
"""Reset the conversation"""
|
|
|
|
| 819 |
visible=False,
|
| 820 |
)
|
| 821 |
custom_model_cfg = gr.Dropdown(
|
| 822 |
+
choices=[
|
| 823 |
+
DEFAULT_TTS_MODEL_CFG[2],
|
| 824 |
+
json.dumps(dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)),
|
| 825 |
+
],
|
| 826 |
value=load_last_used_custom()[2],
|
| 827 |
allow_custom_value=True,
|
| 828 |
label="Config: in a dictionary form",
|