Spaces:
Sleeping
Sleeping
Joseph Pollack
committed on
adds dynamic and multilingual voxpopuli phrases instead, ssr_mode=false in gradio interfaces
Browse files- interface.py +135 -23
- requirements.txt +2 -1
- templates/spaces/demo_voxtral/app.py +1 -1
interface.py
CHANGED
|
@@ -251,18 +251,54 @@ def start_voxtral_training(
|
|
| 251 |
yield line
|
| 252 |
|
| 253 |
|
| 254 |
-
|
| 255 |
-
"
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
"
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
| 268 |
has_gpu, gpu_msg = detect_nvidia_driver()
|
|
@@ -301,16 +337,87 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 301 |
|
| 302 |
jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
|
| 303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
# Recording grid with dynamic text readouts
|
| 305 |
-
phrase_texts_state = gr.State(
|
|
|
|
|
|
|
| 306 |
phrase_markdowns: list[gr.Markdown] = []
|
| 307 |
rec_components = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
with gr.Column():
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
# Advanced options accordion
|
| 316 |
with gr.Accordion("Advanced options", open=False):
|
|
@@ -366,7 +473,8 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 366 |
sr, data = rec
|
| 367 |
out_path = wav_dir / f"rec_{i:04d}.wav"
|
| 368 |
sf.write(str(out_path), data, sr)
|
| 369 |
-
|
|
|
|
| 370 |
rows.append({"audio_path": str(out_path), "text": label_text})
|
| 371 |
jsonl_path = dataset_dir / "data.jsonl"
|
| 372 |
_write_jsonl(rows, jsonl_path)
|
|
@@ -409,11 +517,15 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 409 |
jsonl_path = dataset_dir / "data.jsonl"
|
| 410 |
_write_jsonl(rows, jsonl_path)
|
| 411 |
# Build markdown content updates for on-screen prompts
|
| 412 |
-
|
| 413 |
for i in range(len(phrase_markdowns)):
|
| 414 |
t = texts[i] if i < len(texts) else ""
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
vp_btn.click(
|
| 419 |
_collect_voxpopuli,
|
|
@@ -439,6 +551,6 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 439 |
if __name__ == "__main__":
|
| 440 |
server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
|
| 441 |
server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
|
| 442 |
-
demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True)
|
| 443 |
|
| 444 |
|
|
|
|
| 251 |
yield line
|
| 252 |
|
| 253 |
|
| 254 |
+
def load_voxpopuli_phrases(language="en", max_phrases=None, split="train"):
    """Load transcript phrases from the VoxPopuli dataset.

    Only the ``normalized_text`` column is read, so the audio clips stored
    alongside each transcript are never decoded. Any failure (missing
    ``datasets`` package, download error, unknown language/split, missing
    column) is reported on stdout and a small built-in English phrase list
    is returned instead, so callers always get a usable list.

    Args:
        language: Language code (e.g., 'en', 'de', 'fr', etc.)
        max_phrases: Maximum number of phrases to return. A falsy value
            (``None`` or ``0``) returns every available phrase.
        split: Dataset split to use ('train', 'validation', 'test')

    Returns:
        List of normalized text phrases in random order, each longer than
        10 characters after stripping whitespace.
    """
    try:
        import random

        from datasets import load_dataset

        # Load the specified language dataset
        ds = load_dataset("facebook/voxpopuli", language, split=split)

        # Pull just the transcript column. Iterating over whole examples
        # would decode the audio of every row, which is slow and needs an
        # audio backend even though only the text is used here.
        phrases = []
        for raw_text in ds["normalized_text"]:
            # A missing transcript must not abort the whole load.
            text = (raw_text or "").strip()
            if len(text) > 10:  # Filter out very short phrases
                phrases.append(text)

        # Shuffle and limit if specified
        if max_phrases:
            phrases = random.sample(phrases, min(max_phrases, len(phrases)))
        else:
            # If no limit, shuffle the entire list
            random.shuffle(phrases)

        return phrases

    except Exception as e:
        # Deliberate best-effort: the UI must still start without the dataset.
        print(f"Error loading VoxPopuli phrases: {e}")
        # Fallback to some basic phrases if loading fails
        return [
            "The quick brown fox jumps over the lazy dog.",
            "Please say your full name.",
            "Today is a good day to learn something new.",
            "Artificial intelligence helps with many tasks.",
            "I enjoy reading books and listening to music.",
        ]
|
| 298 |
+
|
| 299 |
+
# Initialize phrases dynamically
|
| 300 |
+
VOXPOPULI_LANGUAGE = "en" # Default to English
|
| 301 |
+
ALL_PHRASES = load_voxpopuli_phrases(VOXPOPULI_LANGUAGE, max_phrases=None)
|
| 302 |
|
| 303 |
with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
| 304 |
has_gpu, gpu_msg = detect_nvidia_driver()
|
|
|
|
| 337 |
|
| 338 |
jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
|
| 339 |
|
| 340 |
+
# Language selection for VoxPopuli phrases
|
| 341 |
+
voxpopuli_lang = gr.Dropdown(
|
| 342 |
+
choices=["en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr", "sk", "sl", "et", "lt"],
|
| 343 |
+
value="en",
|
| 344 |
+
label="VoxPopuli Language",
|
| 345 |
+
info="Select language for phrases from VoxPopuli dataset"
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
# Recording grid with dynamic text readouts
|
| 349 |
+
phrase_texts_state = gr.State(ALL_PHRASES)
|
| 350 |
+
visible_rows_state = gr.State(10) # Start with 10 visible rows
|
| 351 |
+
max_rows = len(ALL_PHRASES) # No cap on total rows
|
| 352 |
phrase_markdowns: list[gr.Markdown] = []
|
| 353 |
rec_components = []
|
| 354 |
+
|
| 355 |
+
def create_recording_grid(phrases, visible_count=10):
    """Build one prompt/recorder pair per phrase.

    For every phrase a bold, numbered ``gr.Markdown`` prompt is created,
    immediately followed by a microphone ``gr.Audio`` input, so the two
    stay adjacent in the layout. Rows whose zero-based index is below
    ``visible_count`` start visible; the rest are created hidden.

    Args:
        phrases: Iterable of phrase strings, one per row.
        visible_count: Number of leading rows shown initially.

    Returns:
        Tuple ``(markdowns, recordings)`` of two parallel lists.
    """
    prompt_blocks, audio_inputs = [], []
    for row_number, sentence in enumerate(phrases, start=1):
        shown = (row_number - 1) < visible_count
        # Creation order matters: prompt first, then its recorder.
        prompt_blocks.append(
            gr.Markdown(f"**{row_number}. {sentence}**", visible=shown)
        )
        audio_inputs.append(
            gr.Audio(
                sources="microphone",
                type="numpy",
                label=f"Recording {row_number}",
                visible=shown,
            )
        )
    return prompt_blocks, audio_inputs
|
| 366 |
+
|
| 367 |
+
# Initial grid creation
|
| 368 |
with gr.Column():
|
| 369 |
+
phrase_markdowns, rec_components = create_recording_grid(ALL_PHRASES, 10)
|
| 370 |
+
|
| 371 |
+
# Add more rows button
|
| 372 |
+
add_rows_btn = gr.Button("➕ Add 10 More Rows", variant="secondary")
|
| 373 |
+
|
| 374 |
+
def add_more_rows(current_visible, current_phrases):
    """Reveal ten more recording rows.

    The click listener for this function lists
    ``[visible_rows_state] + phrase_markdowns + rec_components`` as its
    outputs, so the return value must carry the new counter followed by
    one update per Markdown prompt and then one update per Audio input.

    Args:
        current_visible: Number of rows currently shown.
        current_phrases: Phrase list currently held in state.

    Returns:
        List ``[new_visible, *markdown_updates, *audio_updates]``.
    """
    # Never count rows as visible that have no phrase or no component.
    new_visible = min(
        current_visible + 10,
        len(current_phrases),
        len(phrase_markdowns),
    )

    # Size the updates by the components that exist, not by the phrase
    # list: the phrase list can change length when the language changes,
    # while the set of components is fixed at build time.
    markdown_updates = [
        gr.update(visible=i < new_visible) for i in range(len(phrase_markdowns))
    ]
    audio_updates = [
        gr.update(visible=i < new_visible) for i in range(len(rec_components))
    ]
    return [new_visible] + markdown_updates + audio_updates
|
| 384 |
+
|
| 385 |
+
def change_language(language):
    """Reload phrases for ``language`` and refresh the recording grid.

    The ``change`` listener for this function lists
    ``[phrase_texts_state, visible_rows_state] + phrase_markdowns +
    rec_components`` as its outputs, so the return value carries the new
    phrase list, the reset visible-row count, one update per Markdown
    prompt and then one update per Audio input.

    Args:
        language: Language code selected in the dropdown.

    Returns:
        List ``[new_phrases, visible_count, *markdown_updates,
        *audio_updates]``.
    """
    new_phrases = load_voxpopuli_phrases(language, max_phrases=None)
    # Reset visible rows to 10
    visible_count = min(10, len(new_phrases))

    markdown_updates = []
    for i in range(len(phrase_markdowns)):
        if i < len(new_phrases):
            # Rewrite the text of hidden rows too; otherwise revealing them
            # later would show phrases from the previous language.
            markdown_updates.append(
                gr.update(
                    value=f"**{i+1}. {new_phrases[i]}**",
                    visible=i < visible_count,
                )
            )
        else:
            markdown_updates.append(gr.update(visible=False))

    # Clear every recorder: an existing take was read from the old prompt
    # and would be mislabeled with the new phrase at save time.
    audio_updates = [
        gr.update(value=None, visible=i < visible_count)
        for i in range(len(rec_components))
    ]

    # Components cannot be added after the UI is built, so phrases beyond
    # the existing component count stay in state but are not displayed.
    return [new_phrases, visible_count] + markdown_updates + audio_updates
|
| 408 |
+
|
| 409 |
+
# Connect language change to phrase reloading
|
| 410 |
+
voxpopuli_lang.change(
|
| 411 |
+
change_language,
|
| 412 |
+
inputs=[voxpopuli_lang],
|
| 413 |
+
outputs=[phrase_texts_state, visible_rows_state] + phrase_markdowns + rec_components
|
| 414 |
+
)
|
| 415 |
+
|
| 416 |
+
add_rows_btn.click(
|
| 417 |
+
add_more_rows,
|
| 418 |
+
inputs=[visible_rows_state, phrase_texts_state],
|
| 419 |
+
outputs=[visible_rows_state] + phrase_markdowns + rec_components
|
| 420 |
+
)
|
| 421 |
|
| 422 |
# Advanced options accordion
|
| 423 |
with gr.Accordion("Advanced options", open=False):
|
|
|
|
| 473 |
sr, data = rec
|
| 474 |
out_path = wav_dir / f"rec_{i:04d}.wav"
|
| 475 |
sf.write(str(out_path), data, sr)
|
| 476 |
+
# Use the full phrase list (ALL_PHRASES) instead of just PHRASES
|
| 477 |
+
label_text = (texts[i] if isinstance(texts, list) and i < len(texts) else (ALL_PHRASES[i] if i < len(ALL_PHRASES) else ""))
|
| 478 |
rows.append({"audio_path": str(out_path), "text": label_text})
|
| 479 |
jsonl_path = dataset_dir / "data.jsonl"
|
| 480 |
_write_jsonl(rows, jsonl_path)
|
|
|
|
| 517 |
jsonl_path = dataset_dir / "data.jsonl"
|
| 518 |
_write_jsonl(rows, jsonl_path)
|
| 519 |
# Build markdown content updates for on-screen prompts
|
| 520 |
+
combined_updates = []
|
| 521 |
for i in range(len(phrase_markdowns)):
|
| 522 |
t = texts[i] if i < len(texts) else ""
|
| 523 |
+
if i < len(texts):
|
| 524 |
+
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
| 525 |
+
else:
|
| 526 |
+
combined_updates.append(gr.update(visible=False))
|
| 527 |
+
|
| 528 |
+
return (str(jsonl_path), texts, *combined_updates)
|
| 529 |
|
| 530 |
vp_btn.click(
|
| 531 |
_collect_voxpopuli,
|
|
|
|
| 551 |
if __name__ == "__main__":
|
| 552 |
server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
|
| 553 |
server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
|
| 554 |
+
demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True, ssr_mode=False)
|
| 555 |
|
| 556 |
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
torch
|
| 2 |
datasets
|
| 3 |
peft
|
| 4 |
-
transformers
|
|
|
|
|
|
| 1 |
torch
|
| 2 |
datasets
|
| 3 |
peft
|
| 4 |
+
transformers
|
| 5 |
+
gradio
|
templates/spaces/demo_voxtral/app.py
CHANGED
|
@@ -30,6 +30,6 @@ with gr.Blocks() as demo:
|
|
| 30 |
btn.click(transcribe, inputs=[audio], outputs=[out])
|
| 31 |
|
| 32 |
if __name__ == "__main__":
|
| 33 |
-
demo.launch(mcp_server=True)
|
| 34 |
|
| 35 |
|
|
|
|
| 30 |
btn.click(transcribe, inputs=[audio], outputs=[out])
|
| 31 |
|
| 32 |
if __name__ == "__main__":
|
| 33 |
+
demo.launch(mcp_server=True, ssr_mode=False)
|
| 34 |
|
| 35 |
|