Spaces:
Running
Running
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
app.py
CHANGED
|
@@ -240,23 +240,28 @@ with gr.Blocks() as app_multistyle:
|
|
| 240 |
|
| 241 |
# Regular speech type (mandatory)
|
| 242 |
with gr.Row():
|
| 243 |
-
|
|
|
|
|
|
|
| 244 |
regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
|
| 245 |
regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
|
| 246 |
|
| 247 |
# Additional speech types (up to 99 more)
|
| 248 |
max_speech_types = 100
|
| 249 |
speech_type_rows = []
|
| 250 |
-
speech_type_names = []
|
| 251 |
speech_type_audios = []
|
| 252 |
speech_type_ref_texts = []
|
| 253 |
speech_type_delete_btns = []
|
|
|
|
|
|
|
| 254 |
|
| 255 |
for i in range(max_speech_types - 1):
|
| 256 |
with gr.Row(visible=False) as row:
|
| 257 |
with gr.Column():
|
| 258 |
name_input = gr.Textbox(label="Speech Type Name")
|
| 259 |
delete_btn = gr.Button("Delete", variant="secondary")
|
|
|
|
| 260 |
audio_input = gr.Audio(label="Reference Audio", type="filepath")
|
| 261 |
ref_text_input = gr.Textbox(label="Reference Text", lines=2)
|
| 262 |
speech_type_rows.append(row)
|
|
@@ -264,6 +269,7 @@ with gr.Blocks() as app_multistyle:
|
|
| 264 |
speech_type_audios.append(audio_input)
|
| 265 |
speech_type_ref_texts.append(ref_text_input)
|
| 266 |
speech_type_delete_btns.append(delete_btn)
|
|
|
|
| 267 |
|
| 268 |
# Button to add speech type
|
| 269 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
|
@@ -321,6 +327,22 @@ with gr.Blocks() as app_multistyle:
|
|
| 321 |
placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
|
| 322 |
)
|
| 323 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
# Model choice
|
| 325 |
model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
|
| 326 |
|
|
@@ -347,7 +369,7 @@ with gr.Blocks() as app_multistyle:
|
|
| 347 |
speech_type_names_list = args[:num_additional_speech_types]
|
| 348 |
speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
|
| 349 |
speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
|
| 350 |
-
model_choice = args[3 * num_additional_speech_types]
|
| 351 |
remove_silence = args[3 * num_additional_speech_types + 1]
|
| 352 |
|
| 353 |
# Collect the speech types and their audios into a dict
|
|
|
|
| 240 |
|
| 241 |
# Regular speech type (mandatory)
# Layout: a narrow column holding the editable name and its "Insert" button,
# followed by the reference audio and the reference text in the same row.
with gr.Row():
    with gr.Column():
        regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
        regular_insert = gr.Button("Insert", variant="secondary")
    regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
    regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
|
| 248 |
|
| 249 |
# Additional speech types (up to 99 more)
max_speech_types = 100

# Per-row component lists for the additional speech types; these start empty.
speech_type_rows = []
speech_type_audios = []
speech_type_ref_texts = []
speech_type_delete_btns = []

# The name boxes and "Insert" buttons both start with the regular entry, so
# position 0 in each list refers to the regular speech type.
speech_type_names = [regular_name]
speech_type_insert_btns = [regular_insert]
|
| 258 |
|
| 259 |
for i in range(max_speech_types - 1):
|
| 260 |
with gr.Row(visible=False) as row:
|
| 261 |
with gr.Column():
|
| 262 |
name_input = gr.Textbox(label="Speech Type Name")
|
| 263 |
delete_btn = gr.Button("Delete", variant="secondary")
|
| 264 |
+
insert_btn = gr.Button("Insert", variant="secondary")
|
| 265 |
audio_input = gr.Audio(label="Reference Audio", type="filepath")
|
| 266 |
ref_text_input = gr.Textbox(label="Reference Text", lines=2)
|
| 267 |
speech_type_rows.append(row)
|
|
|
|
| 269 |
speech_type_audios.append(audio_input)
|
| 270 |
speech_type_ref_texts.append(ref_text_input)
|
| 271 |
speech_type_delete_btns.append(delete_btn)
|
| 272 |
+
speech_type_insert_btns.append(insert_btn)
|
| 273 |
|
| 274 |
# Button to add speech type
|
| 275 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
|
|
|
| 327 |
placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
|
| 328 |
)
|
| 329 |
|
| 330 |
+
def make_insert_speech_type_fn(index):
    """Return a click handler that appends a speech-type marker to the script.

    ``index`` is accepted from the caller but is not read by the handler;
    the speech-type name reaches the handler through its second argument.
    """

    def insert_speech_type_fn(current_text, speech_type_name):
        """Append ``{name} `` to the current text and return a Gradio update.

        A missing or empty ``current_text`` is treated as an empty string,
        and a missing or empty ``speech_type_name`` is written as ``None``.
        """
        base = current_text if current_text else ""
        label = speech_type_name if speech_type_name else "None"
        return gr.update(value=base + f"{{{label}}} ")

    return insert_speech_type_fn
|
| 337 |
+
|
| 338 |
+
# Hook up every "Insert" button: clicking button i passes the script textbox
# and the name textbox at the same position to the handler, and the handler's
# result is written back to the script textbox.
for i, insert_btn in enumerate(speech_type_insert_btns):
    insert_fn = make_insert_speech_type_fn(i)
    name_box = speech_type_names[i]
    insert_btn.click(
        insert_fn,
        inputs=[gen_text_input_multistyle, name_box],
        outputs=gen_text_input_multistyle,
    )
|
| 345 |
+
|
| 346 |
# Model choice
|
| 347 |
model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
|
| 348 |
|
|
|
|
| 369 |
speech_type_names_list = args[:num_additional_speech_types]
|
| 370 |
speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
|
| 371 |
speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
|
| 372 |
+
model_choice = args[3 * num_additional_speech_types + 1]
|
| 373 |
remove_silence = args[3 * num_additional_speech_types + 1]
|
| 374 |
|
| 375 |
# Collect the speech types and their audios into a dict
|