Spaces:
Running
Running
Joseph Pollack
committed on
adds dynamic and multilingual voxpopuli phrases instead, ssr_mode=false in gradio interfaces
Browse files- interface.py +135 -23
- requirements.txt +2 -1
- templates/spaces/demo_voxtral/app.py +1 -1
interface.py
CHANGED
@@ -251,18 +251,54 @@ def start_voxtral_training(
|
|
251 |
yield line
|
252 |
|
253 |
|
254 |
-
|
255 |
-
"
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
"
|
265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
|
267 |
with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
268 |
has_gpu, gpu_msg = detect_nvidia_driver()
|
@@ -301,16 +337,87 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
301 |
|
302 |
jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
|
303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
# Recording grid with dynamic text readouts
|
305 |
-
phrase_texts_state = gr.State(
|
|
|
|
|
306 |
phrase_markdowns: list[gr.Markdown] = []
|
307 |
rec_components = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
with gr.Column():
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
|
315 |
# Advanced options accordion
|
316 |
with gr.Accordion("Advanced options", open=False):
|
@@ -366,7 +473,8 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
366 |
sr, data = rec
|
367 |
out_path = wav_dir / f"rec_{i:04d}.wav"
|
368 |
sf.write(str(out_path), data, sr)
|
369 |
-
|
|
|
370 |
rows.append({"audio_path": str(out_path), "text": label_text})
|
371 |
jsonl_path = dataset_dir / "data.jsonl"
|
372 |
_write_jsonl(rows, jsonl_path)
|
@@ -409,11 +517,15 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
409 |
jsonl_path = dataset_dir / "data.jsonl"
|
410 |
_write_jsonl(rows, jsonl_path)
|
411 |
# Build markdown content updates for on-screen prompts
|
412 |
-
|
413 |
for i in range(len(phrase_markdowns)):
|
414 |
t = texts[i] if i < len(texts) else ""
|
415 |
-
|
416 |
-
|
|
|
|
|
|
|
|
|
417 |
|
418 |
vp_btn.click(
|
419 |
_collect_voxpopuli,
|
@@ -439,6 +551,6 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
439 |
if __name__ == "__main__":
|
440 |
server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
|
441 |
server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
|
442 |
-
demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True)
|
443 |
|
444 |
|
|
|
251 |
yield line
|
252 |
|
253 |
|
254 |
+
def load_voxpopuli_phrases(language="en", max_phrases=None, split="train"):
    """Load prompt phrases from the VoxPopuli dataset.

    The transcripts are filtered (more than 10 characters after stripping)
    and returned in random order. If the dataset cannot be loaded for any
    reason (missing ``datasets`` package, no network, unknown language
    config, ...), the error is printed and a small built-in list of English
    phrases is returned instead so the UI can still start.

    Args:
        language: VoxPopuli language config (e.g., 'en', 'de', 'fr', etc.)
        max_phrases: Maximum number of phrases to return. ``None`` (or any
            non-positive value) means "no limit". The limit is applied to the
            fallback list as well.
        split: Dataset split to use ('train', 'validation', 'test')

    Returns:
        List of normalized text phrases.
    """
    import random

    fallback_phrases = [
        "The quick brown fox jumps over the lazy dog.",
        "Please say your full name.",
        "Today is a good day to learn something new.",
        "Artificial intelligence helps with many tasks.",
        "I enjoy reading books and listening to music.",
    ]
    # Treat None / 0 / negative values uniformly as "no limit" so that a bad
    # value can never make random.sample() raise.
    limit = max_phrases if max_phrases and max_phrases > 0 else None

    try:
        # Imported lazily so a missing `datasets` install degrades to the
        # fallback phrases instead of breaking module import.
        from datasets import load_dataset

        ds = load_dataset("facebook/voxpopuli", language, split=split)

        # Read only the transcript column; iterating full examples would
        # also pull in (and decode) every row's audio, which is not needed.
        phrases = []
        for raw_text in ds["normalized_text"]:
            # Some rows may carry no transcript at all; treat None as empty.
            text = (raw_text or "").strip()
            if len(text) > 10:  # Filter out very short phrases
                phrases.append(text)
    except Exception as e:
        # Deliberate best-effort boundary: the UI must come up even offline.
        print(f"Error loading VoxPopuli phrases: {e}")
        return fallback_phrases[:limit] if limit else fallback_phrases

    if limit:
        # random.sample both shuffles and truncates in one step.
        return random.sample(phrases, min(limit, len(phrases)))

    random.shuffle(phrases)
    return phrases
|
298 |
+
|
299 |
+
# Initialize phrases dynamically
|
300 |
+
VOXPOPULI_LANGUAGE = "en" # Default to English
|
301 |
+
ALL_PHRASES = load_voxpopuli_phrases(VOXPOPULI_LANGUAGE, max_phrases=None)
|
302 |
|
303 |
with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
304 |
has_gpu, gpu_msg = detect_nvidia_driver()
|
|
|
337 |
|
338 |
jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
|
339 |
|
340 |
+
# Language selection for VoxPopuli phrases
|
341 |
+
voxpopuli_lang = gr.Dropdown(
|
342 |
+
choices=["en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr", "sk", "sl", "et", "lt"],
|
343 |
+
value="en",
|
344 |
+
label="VoxPopuli Language",
|
345 |
+
info="Select language for phrases from VoxPopuli dataset"
|
346 |
+
)
|
347 |
+
|
348 |
# Recording grid with dynamic text readouts
|
349 |
+
phrase_texts_state = gr.State(ALL_PHRASES)
|
350 |
+
visible_rows_state = gr.State(10) # Start with 10 visible rows
|
351 |
+
max_rows = len(ALL_PHRASES) # No cap on total rows
|
352 |
phrase_markdowns: list[gr.Markdown] = []
|
353 |
rec_components = []
|
354 |
+
|
355 |
+
def create_recording_grid(phrases, visible_count=10):
    """Build one prompt/recorder pair per phrase.

    For every phrase a ``gr.Markdown`` showing the numbered prompt is created
    first, immediately followed by its microphone ``gr.Audio`` recorder. Only
    the first ``visible_count`` pairs start out visible.

    Returns:
        A ``(markdowns, recordings)`` tuple of two parallel lists.
    """
    prompt_blocks = []
    recorder_blocks = []
    for position, phrase in enumerate(phrases, start=1):
        show = position <= visible_count
        # Creation order matters: the prompt is rendered above its recorder.
        prompt_blocks.append(gr.Markdown(f"**{position}. {phrase}**", visible=show))
        recorder_blocks.append(
            gr.Audio(
                sources="microphone",
                type="numpy",
                label=f"Recording {position}",
                visible=show,
            )
        )
    return prompt_blocks, recorder_blocks
|
366 |
+
|
367 |
+
# Initial grid creation
|
368 |
with gr.Column():
|
369 |
+
phrase_markdowns, rec_components = create_recording_grid(ALL_PHRASES, 10)
|
370 |
+
|
371 |
+
# Add more rows button
|
372 |
+
add_rows_btn = gr.Button("➕ Add 10 More Rows", variant="secondary")
|
373 |
+
|
374 |
+
def add_more_rows(current_visible, current_phrases):
    """Reveal 10 more prompt/recorder rows.

    The click handler is registered with
    ``[visible_rows_state] + phrase_markdowns + rec_components`` as outputs,
    so exactly one update per markdown AND one per recorder must be returned
    (in that order) after the new visible count.

    Args:
        current_visible: Number of rows currently shown.
        current_phrases: Phrase list currently held in state.

    Returns:
        ``[new_visible, *markdown_updates, *recorder_updates]``.
    """
    # Never reveal more rows than there are phrases or pre-built components.
    new_visible = min(
        current_visible + 10,
        len(current_phrases),
        len(phrase_markdowns),
    )
    # Size the updates by the components that actually exist, not by the
    # phrase list, which may have a different length after a language change.
    markdown_updates = [
        gr.update(visible=i < new_visible) for i in range(len(phrase_markdowns))
    ]
    recorder_updates = [
        gr.update(visible=i < new_visible) for i in range(len(rec_components))
    ]
    return [new_visible] + markdown_updates + recorder_updates
|
384 |
+
|
385 |
+
def change_language(language):
    """Reload phrases for ``language`` and refresh the recording grid.

    The change handler is registered with
    ``[phrase_texts_state, visible_rows_state] + phrase_markdowns + rec_components``
    as outputs, so one update per markdown and one per recorder must follow
    the two state values.

    Args:
        language: VoxPopuli language code selected in the dropdown.

    Returns:
        ``[new_phrases, visible_count, *markdown_updates, *recorder_updates]``.
    """
    new_phrases = load_voxpopuli_phrases(language, max_phrases=None)

    # Components are created once at build time; only that many rows can ever
    # be shown. Extra phrases stay in state but have no row to display them.
    total_rows = len(phrase_markdowns)
    # Reset visible rows to 10 (or fewer if there is less to show).
    visible_count = min(10, len(new_phrases), total_rows)

    markdown_updates = []
    for i in range(total_rows):
        if i < len(new_phrases):
            # Rewrite the text of hidden rows too, so rows revealed later by
            # "add more rows" show the new language rather than stale text.
            markdown_updates.append(
                gr.update(
                    value=f"**{i+1}. {new_phrases[i]}**",
                    visible=i < visible_count,
                )
            )
        else:
            markdown_updates.append(gr.update(visible=False))

    # Clear existing recordings: they were read from the previous phrases and
    # would otherwise be saved with the new phrases as their transcripts.
    recorder_updates = [
        gr.update(value=None, visible=i < visible_count)
        for i in range(len(rec_components))
    ]

    return [new_phrases, visible_count] + markdown_updates + recorder_updates
|
408 |
+
|
409 |
+
# Connect language change to phrase reloading
|
410 |
+
voxpopuli_lang.change(
|
411 |
+
change_language,
|
412 |
+
inputs=[voxpopuli_lang],
|
413 |
+
outputs=[phrase_texts_state, visible_rows_state] + phrase_markdowns + rec_components
|
414 |
+
)
|
415 |
+
|
416 |
+
add_rows_btn.click(
|
417 |
+
add_more_rows,
|
418 |
+
inputs=[visible_rows_state, phrase_texts_state],
|
419 |
+
outputs=[visible_rows_state] + phrase_markdowns + rec_components
|
420 |
+
)
|
421 |
|
422 |
# Advanced options accordion
|
423 |
with gr.Accordion("Advanced options", open=False):
|
|
|
473 |
sr, data = rec
|
474 |
out_path = wav_dir / f"rec_{i:04d}.wav"
|
475 |
sf.write(str(out_path), data, sr)
|
476 |
+
# Use the full phrase list (ALL_PHRASES) instead of just PHRASES
|
477 |
+
label_text = (texts[i] if isinstance(texts, list) and i < len(texts) else (ALL_PHRASES[i] if i < len(ALL_PHRASES) else ""))
|
478 |
rows.append({"audio_path": str(out_path), "text": label_text})
|
479 |
jsonl_path = dataset_dir / "data.jsonl"
|
480 |
_write_jsonl(rows, jsonl_path)
|
|
|
517 |
jsonl_path = dataset_dir / "data.jsonl"
|
518 |
_write_jsonl(rows, jsonl_path)
|
519 |
# Build markdown content updates for on-screen prompts
|
520 |
+
combined_updates = []
|
521 |
for i in range(len(phrase_markdowns)):
|
522 |
t = texts[i] if i < len(texts) else ""
|
523 |
+
if i < len(texts):
|
524 |
+
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
525 |
+
else:
|
526 |
+
combined_updates.append(gr.update(visible=False))
|
527 |
+
|
528 |
+
return (str(jsonl_path), texts, *combined_updates)
|
529 |
|
530 |
vp_btn.click(
|
531 |
_collect_voxpopuli,
|
|
|
551 |
if __name__ == "__main__":
|
552 |
server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
|
553 |
server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
|
554 |
+
demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True, ssr_mode=False)
|
555 |
|
556 |
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
torch
|
2 |
datasets
|
3 |
peft
|
4 |
-
transformers
|
|
|
|
1 |
torch
|
2 |
datasets
|
3 |
peft
|
4 |
+
transformers
|
5 |
+
gradio
|
templates/spaces/demo_voxtral/app.py
CHANGED
@@ -30,6 +30,6 @@ with gr.Blocks() as demo:
|
|
30 |
btn.click(transcribe, inputs=[audio], outputs=[out])
|
31 |
|
32 |
if __name__ == "__main__":
|
33 |
-
demo.launch(mcp_server=True)
|
34 |
|
35 |
|
|
|
30 |
btn.click(transcribe, inputs=[audio], outputs=[out])
|
31 |
|
32 |
if __name__ == "__main__":
|
33 |
+
demo.launch(mcp_server=True, ssr_mode=False)
|
34 |
|
35 |
|