Spaces:
Running
Running
Joseph Pollack
committed on
adds additional components to the interface for recording
Browse files- interface.py +189 -68
interface.py
CHANGED
@@ -254,9 +254,9 @@ def start_voxtral_training(
|
|
254 |
def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
255 |
"""Load phrases from various multilingual speech datasets.
|
256 |
|
257 |
-
|
258 |
-
1.
|
259 |
-
2.
|
260 |
3. Fallback to basic phrases
|
261 |
|
262 |
Args:
|
@@ -272,70 +272,97 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
|
272 |
|
273 |
# Language code mapping for different datasets
|
274 |
lang_mappings = {
|
275 |
-
"en": {"
|
276 |
-
"de": {"
|
277 |
-
"fr": {"
|
278 |
-
"es": {"
|
279 |
-
"it": {"
|
280 |
-
"pt": {"
|
281 |
-
"pl": {"
|
282 |
-
"nl": {"
|
283 |
-
"ru": {"
|
284 |
-
"ar": {"
|
285 |
-
"zh": {"
|
286 |
-
"ja": {"
|
287 |
-
"ko": {"
|
288 |
}
|
289 |
|
290 |
-
lang_config = lang_mappings.get(language, {"
|
291 |
|
292 |
-
# Try
|
293 |
try:
|
294 |
-
print(f"Trying
|
295 |
-
|
296 |
-
ds = load_dataset("
|
297 |
|
298 |
phrases = []
|
299 |
count = 0
|
|
|
|
|
300 |
for example in ds:
|
301 |
if max_phrases and count >= max_phrases:
|
302 |
break
|
303 |
-
|
304 |
-
if
|
305 |
-
phrases.append(
|
|
|
306 |
count += 1
|
307 |
|
308 |
if phrases:
|
309 |
-
print(f"Successfully loaded {len(phrases)} phrases from
|
310 |
random.shuffle(phrases)
|
311 |
return phrases
|
312 |
|
313 |
except Exception as e:
|
314 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
|
316 |
-
# Try
|
317 |
try:
|
318 |
-
print(f"Trying
|
319 |
-
|
320 |
-
ds = load_dataset("google/fleurs", fleurs_lang, split=split, streaming=True)
|
321 |
|
322 |
phrases = []
|
323 |
count = 0
|
324 |
for example in ds:
|
325 |
if max_phrases and count >= max_phrases:
|
326 |
break
|
327 |
-
text = example.get("
|
328 |
if text and len(text) > 10: # Filter out very short phrases
|
329 |
phrases.append(text)
|
330 |
count += 1
|
331 |
|
332 |
if phrases:
|
333 |
-
print(f"Successfully loaded {len(phrases)} phrases from
|
334 |
random.shuffle(phrases)
|
335 |
return phrases
|
336 |
|
337 |
except Exception as e:
|
338 |
-
print(f"
|
339 |
|
340 |
# Final fallback to basic phrases
|
341 |
print("All dataset loading attempts failed, using fallback phrases")
|
@@ -434,17 +461,20 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
434 |
# Recording grid with dynamic text readouts
|
435 |
phrase_texts_state = gr.State(ALL_PHRASES)
|
436 |
visible_rows_state = gr.State(10) # Start with 10 visible rows
|
437 |
-
|
|
|
|
|
438 |
phrase_markdowns: list[gr.Markdown] = []
|
439 |
rec_components = []
|
440 |
|
441 |
-
def create_recording_grid(
|
442 |
-
"""Create recording grid components
|
443 |
markdowns = []
|
444 |
recordings = []
|
445 |
-
for idx
|
446 |
-
visible = idx <
|
447 |
-
|
|
|
448 |
markdowns.append(md)
|
449 |
comp = gr.Audio(sources="microphone", type="numpy", label=f"Recording {idx+1}", visible=visible)
|
450 |
recordings.append(comp)
|
@@ -452,44 +482,41 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
452 |
|
453 |
# Initial grid creation
|
454 |
with gr.Column():
|
455 |
-
phrase_markdowns, rec_components = create_recording_grid(
|
456 |
|
457 |
# Add more rows button
|
458 |
add_rows_btn = gr.Button("➕ Add 10 More Rows", variant="secondary")
|
459 |
|
460 |
def add_more_rows(current_visible, current_phrases):
|
461 |
"""Add 10 more rows by making them visible"""
|
462 |
-
new_visible = min(current_visible + 10, len(current_phrases))
|
|
|
|
|
463 |
visibility_updates = []
|
464 |
-
for i in range(
|
465 |
-
if i < new_visible:
|
466 |
visibility_updates.append(gr.update(visible=True))
|
467 |
else:
|
468 |
visibility_updates.append(gr.update(visible=False))
|
|
|
469 |
return [new_visible] + visibility_updates
|
470 |
|
471 |
def change_language(language):
|
472 |
"""Change the language and reload phrases from multilingual datasets"""
|
473 |
new_phrases = load_multilingual_phrases(language, max_phrases=None)
|
474 |
# Reset visible rows to 10
|
475 |
-
visible_count = min(10, len(new_phrases))
|
476 |
|
477 |
-
# Create
|
478 |
-
current_len = len(phrase_markdowns)
|
479 |
combined_updates = []
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
|
486 |
-
else:
|
487 |
-
combined_updates.append(gr.update(visible=False))
|
488 |
else:
|
489 |
-
combined_updates.append(gr.update(visible=False))
|
490 |
|
491 |
-
# If we have more phrases than components, we can't update them via Gradio
|
492 |
-
# The interface will need to be reloaded for significantly different phrase counts
|
493 |
return [new_phrases, visible_count] + combined_updates
|
494 |
|
495 |
# Connect language change to phrase reloading
|
@@ -505,6 +532,56 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
505 |
outputs=[visible_rows_state] + phrase_markdowns + rec_components
|
506 |
)
|
507 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
508 |
# Advanced options accordion
|
509 |
with gr.Accordion("Advanced options", open=False):
|
510 |
base_model = gr.Textbox(value="mistralai/Voxtral-Mini-3B-2507", label="Base Voxtral model")
|
@@ -576,22 +653,66 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
576 |
vp_btn = gr.Button("Use Multilingual Dataset Sample")
|
577 |
|
578 |
def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
|
579 |
-
"""Load sample from multilingual datasets (
|
580 |
from datasets import load_dataset, Audio
|
581 |
import random
|
582 |
|
583 |
-
# Language code mapping for
|
584 |
-
|
585 |
"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
|
586 |
"pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
|
587 |
-
"zh": "zh
|
588 |
}
|
589 |
|
590 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
591 |
|
|
|
592 |
try:
|
593 |
-
|
594 |
-
ds = load_dataset("mozilla-foundation/common_voice_11_0", cv_lang, split=split, streaming=True)
|
595 |
ds = ds.cast_column("audio", Audio(sampling_rate=16000))
|
596 |
|
597 |
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
@@ -605,7 +726,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
605 |
|
606 |
audio = ex.get("audio") or {}
|
607 |
path = audio.get("path")
|
608 |
-
text = ex.get("
|
609 |
|
610 |
if path and text and len(text) > 10:
|
611 |
rows.append({"audio_path": path, "text": text})
|
@@ -618,7 +739,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
618 |
|
619 |
# Build markdown content updates for on-screen prompts
|
620 |
combined_updates = []
|
621 |
-
for i in range(
|
622 |
t = texts[i] if i < len(texts) else ""
|
623 |
if i < len(texts):
|
624 |
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
@@ -628,7 +749,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
628 |
return (str(jsonl_path), texts, *combined_updates)
|
629 |
|
630 |
except Exception as e:
|
631 |
-
print(f"
|
632 |
|
633 |
# Fallback: generate synthetic samples with text only
|
634 |
print("Using fallback: generating text-only samples")
|
@@ -642,7 +763,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
642 |
|
643 |
# Build markdown content updates for on-screen prompts
|
644 |
combined_updates = []
|
645 |
-
for i in range(
|
646 |
t = texts[i] if i < len(texts) else ""
|
647 |
if i < len(texts):
|
648 |
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
|
|
254 |
def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
255 |
"""Load phrases from various multilingual speech datasets.
|
256 |
|
257 |
+
Uses datasets that work with current library versions:
|
258 |
+
1. ML Commons Speech (modern format)
|
259 |
+
2. Multilingual LibriSpeech (modern format)
|
260 |
3. Fallback to basic phrases
|
261 |
|
262 |
Args:
|
|
|
272 |
|
273 |
# Language code mapping for different datasets
|
274 |
lang_mappings = {
|
275 |
+
"en": {"ml_speech": "en", "librispeech": "clean"},
|
276 |
+
"de": {"ml_speech": "de", "librispeech": None},
|
277 |
+
"fr": {"ml_speech": "fr", "librispeech": None},
|
278 |
+
"es": {"ml_speech": "es", "librispeech": None},
|
279 |
+
"it": {"ml_speech": "it", "librispeech": None},
|
280 |
+
"pt": {"ml_speech": "pt", "librispeech": None},
|
281 |
+
"pl": {"ml_speech": "pl", "librispeech": None},
|
282 |
+
"nl": {"ml_speech": "nl", "librispeech": None},
|
283 |
+
"ru": {"ml_speech": "ru", "librispeech": None},
|
284 |
+
"ar": {"ml_speech": "ar", "librispeech": None},
|
285 |
+
"zh": {"ml_speech": "zh", "librispeech": None},
|
286 |
+
"ja": {"ml_speech": "ja", "librispeech": None},
|
287 |
+
"ko": {"ml_speech": "ko", "librispeech": None},
|
288 |
}
|
289 |
|
290 |
+
lang_config = lang_mappings.get(language, {"ml_speech": language, "librispeech": None})
|
291 |
|
292 |
+
# Try ML Commons Speech first (modern format)
|
293 |
try:
|
294 |
+
print(f"Trying ML Commons Speech dataset for language: {language}")
|
295 |
+
ml_lang = lang_config["ml_speech"]
|
296 |
+
ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
|
297 |
|
298 |
phrases = []
|
299 |
count = 0
|
300 |
+
seen_words = set()
|
301 |
+
|
302 |
for example in ds:
|
303 |
if max_phrases and count >= max_phrases:
|
304 |
break
|
305 |
+
word = example.get("word", "").strip()
|
306 |
+
if word and len(word) > 2 and word not in seen_words: # Filter duplicates and short words
|
307 |
+
phrases.append(word)
|
308 |
+
seen_words.add(word)
|
309 |
count += 1
|
310 |
|
311 |
if phrases:
|
312 |
+
print(f"Successfully loaded {len(phrases)} phrases from ML Commons Speech")
|
313 |
random.shuffle(phrases)
|
314 |
return phrases
|
315 |
|
316 |
except Exception as e:
|
317 |
+
print(f"ML Commons Speech failed: {e}")
|
318 |
+
|
319 |
+
# Try Multilingual LibriSpeech as backup
|
320 |
+
try:
|
321 |
+
if lang_config["librispeech"]:
|
322 |
+
print(f"Trying Multilingual LibriSpeech dataset for language: {language}")
|
323 |
+
librispeech_lang = lang_config["librispeech"]
|
324 |
+
ds = load_dataset("facebook/multilingual_librispeech", f"{language}", split=split, streaming=True)
|
325 |
+
|
326 |
+
phrases = []
|
327 |
+
count = 0
|
328 |
+
for example in ds:
|
329 |
+
if max_phrases and count >= max_phrases:
|
330 |
+
break
|
331 |
+
text = example.get("text", "").strip()
|
332 |
+
if text and len(text) > 10: # Filter out very short phrases
|
333 |
+
phrases.append(text)
|
334 |
+
count += 1
|
335 |
+
|
336 |
+
if phrases:
|
337 |
+
print(f"Successfully loaded {len(phrases)} phrases from Multilingual LibriSpeech")
|
338 |
+
random.shuffle(phrases)
|
339 |
+
return phrases
|
340 |
+
|
341 |
+
except Exception as e:
|
342 |
+
print(f"Multilingual LibriSpeech failed: {e}")
|
343 |
|
344 |
+
# Try TED Talk translations (works for many languages)
|
345 |
try:
|
346 |
+
print(f"Trying TED Talk translations for language: {language}")
|
347 |
+
ds = load_dataset("ted_talks_iwslt", language=[f"{language}_en"], split=split, streaming=True)
|
|
|
348 |
|
349 |
phrases = []
|
350 |
count = 0
|
351 |
for example in ds:
|
352 |
if max_phrases and count >= max_phrases:
|
353 |
break
|
354 |
+
text = example.get("translation", {}).get(language, "").strip()
|
355 |
if text and len(text) > 10: # Filter out very short phrases
|
356 |
phrases.append(text)
|
357 |
count += 1
|
358 |
|
359 |
if phrases:
|
360 |
+
print(f"Successfully loaded {len(phrases)} phrases from TED Talks")
|
361 |
random.shuffle(phrases)
|
362 |
return phrases
|
363 |
|
364 |
except Exception as e:
|
365 |
+
print(f"TED Talks failed: {e}")
|
366 |
|
367 |
# Final fallback to basic phrases
|
368 |
print("All dataset loading attempts failed, using fallback phrases")
|
|
|
461 |
# Recording grid with dynamic text readouts
|
462 |
phrase_texts_state = gr.State(ALL_PHRASES)
|
463 |
visible_rows_state = gr.State(10) # Start with 10 visible rows
|
464 |
+
MAX_COMPONENTS = 100 # Fixed maximum number of components
|
465 |
+
|
466 |
+
# Create fixed number of components upfront
|
467 |
phrase_markdowns: list[gr.Markdown] = []
|
468 |
rec_components = []
|
469 |
|
470 |
+
def create_recording_grid(max_components=MAX_COMPONENTS):
|
471 |
+
"""Create recording grid components with fixed maximum"""
|
472 |
markdowns = []
|
473 |
recordings = []
|
474 |
+
for idx in range(max_components):
|
475 |
+
visible = idx < 10 # Only first 10 visible initially
|
476 |
+
phrase_text = ALL_PHRASES[idx] if idx < len(ALL_PHRASES) else ""
|
477 |
+
md = gr.Markdown(f"**{idx+1}. {phrase_text}**", visible=visible)
|
478 |
markdowns.append(md)
|
479 |
comp = gr.Audio(sources="microphone", type="numpy", label=f"Recording {idx+1}", visible=visible)
|
480 |
recordings.append(comp)
|
|
|
482 |
|
483 |
# Initial grid creation
|
484 |
with gr.Column():
|
485 |
+
phrase_markdowns, rec_components = create_recording_grid(MAX_COMPONENTS)
|
486 |
|
487 |
# Add more rows button
|
488 |
add_rows_btn = gr.Button("➕ Add 10 More Rows", variant="secondary")
|
489 |
|
490 |
def add_more_rows(current_visible, current_phrases):
|
491 |
"""Add 10 more rows by making them visible"""
|
492 |
+
new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))
|
493 |
+
|
494 |
+
# Create updates for all MAX_COMPONENTS
|
495 |
visibility_updates = []
|
496 |
+
for i in range(MAX_COMPONENTS):
|
497 |
+
if i < len(current_phrases) and i < new_visible:
|
498 |
visibility_updates.append(gr.update(visible=True))
|
499 |
else:
|
500 |
visibility_updates.append(gr.update(visible=False))
|
501 |
+
|
502 |
return [new_visible] + visibility_updates
|
503 |
|
504 |
def change_language(language):
|
505 |
"""Change the language and reload phrases from multilingual datasets"""
|
506 |
new_phrases = load_multilingual_phrases(language, max_phrases=None)
|
507 |
# Reset visible rows to 10
|
508 |
+
visible_count = min(10, len(new_phrases), MAX_COMPONENTS)
|
509 |
|
510 |
+
# Create updates for all MAX_COMPONENTS
|
|
|
511 |
combined_updates = []
|
512 |
+
for i in range(MAX_COMPONENTS):
|
513 |
+
if i < len(new_phrases) and i < visible_count:
|
514 |
+
combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
|
515 |
+
elif i < len(new_phrases):
|
516 |
+
combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=False))
|
|
|
|
|
|
|
517 |
else:
|
518 |
+
combined_updates.append(gr.update(value=f"**{i+1}. **", visible=False))
|
519 |
|
|
|
|
|
520 |
return [new_phrases, visible_count] + combined_updates
|
521 |
|
522 |
# Connect language change to phrase reloading
|
|
|
532 |
outputs=[visible_rows_state] + phrase_markdowns + rec_components
|
533 |
)
|
534 |
|
535 |
+
# Recording dataset creation button
|
536 |
+
record_dataset_btn = gr.Button("🎙️ Create Dataset from Recordings", variant="primary")
|
537 |
+
|
538 |
+
def create_recording_dataset(*recordings_and_state):
|
539 |
+
"""Create dataset from visible recordings and phrases"""
|
540 |
+
try:
|
541 |
+
import soundfile as sf
|
542 |
+
|
543 |
+
# Extract recordings and state
|
544 |
+
recordings = recordings_and_state[:-1] # All except the last item (phrases)
|
545 |
+
phrases = recordings_and_state[-1] # Last item is phrases
|
546 |
+
|
547 |
+
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
548 |
+
wav_dir = dataset_dir / "wavs"
|
549 |
+
wav_dir.mkdir(parents=True, exist_ok=True)
|
550 |
+
|
551 |
+
rows = []
|
552 |
+
successful_recordings = 0
|
553 |
+
|
554 |
+
# Process each recording
|
555 |
+
for i, rec in enumerate(recordings):
|
556 |
+
if rec is not None and i < len(phrases):
|
557 |
+
try:
|
558 |
+
sr, data = rec
|
559 |
+
out_path = wav_dir / f"recording_{i:04d}.wav"
|
560 |
+
sf.write(str(out_path), data, sr)
|
561 |
+
rows.append({"audio_path": str(out_path), "text": phrases[i]})
|
562 |
+
successful_recordings += 1
|
563 |
+
except Exception as e:
|
564 |
+
print(f"Error processing recording {i}: {e}")
|
565 |
+
|
566 |
+
if rows:
|
567 |
+
jsonl_path = dataset_dir / "recorded_data.jsonl"
|
568 |
+
_write_jsonl(rows, jsonl_path)
|
569 |
+
return f"✅ Dataset created successfully! {successful_recordings} recordings saved to {jsonl_path}"
|
570 |
+
else:
|
571 |
+
return "❌ No recordings found. Please record some audio first."
|
572 |
+
|
573 |
+
except Exception as e:
|
574 |
+
return f"❌ Error creating dataset: {str(e)}"
|
575 |
+
|
576 |
+
# Status display for dataset creation
|
577 |
+
dataset_status = gr.Textbox(label="Dataset Creation Status", interactive=False, visible=True)
|
578 |
+
|
579 |
+
record_dataset_btn.click(
|
580 |
+
create_recording_dataset,
|
581 |
+
inputs=rec_components + [phrase_texts_state],
|
582 |
+
outputs=[dataset_status]
|
583 |
+
)
|
584 |
+
|
585 |
# Advanced options accordion
|
586 |
with gr.Accordion("Advanced options", open=False):
|
587 |
base_model = gr.Textbox(value="mistralai/Voxtral-Mini-3B-2507", label="Base Voxtral model")
|
|
|
653 |
vp_btn = gr.Button("Use Multilingual Dataset Sample")
|
654 |
|
655 |
def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
|
656 |
+
"""Load sample from multilingual datasets (ML Commons preferred)"""
|
657 |
from datasets import load_dataset, Audio
|
658 |
import random
|
659 |
|
660 |
+
# Language code mapping for ML Commons Speech
|
661 |
+
ml_lang_map = {
|
662 |
"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
|
663 |
"pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
|
664 |
+
"zh": "zh", "ja": "ja", "ko": "ko"
|
665 |
}
|
666 |
|
667 |
+
ml_lang = ml_lang_map.get(lang_code, lang_code)
|
668 |
+
|
669 |
+
try:
|
670 |
+
# Try ML Commons Speech first
|
671 |
+
ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
|
672 |
+
ds = ds.cast_column("audio", Audio(sampling_rate=16000))
|
673 |
+
|
674 |
+
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
675 |
+
rows: list[dict] = []
|
676 |
+
texts: list[str] = []
|
677 |
+
|
678 |
+
count = 0
|
679 |
+
seen_words = set()
|
680 |
+
|
681 |
+
for ex in ds:
|
682 |
+
if count >= num_samples:
|
683 |
+
break
|
684 |
+
|
685 |
+
audio = ex.get("audio") or {}
|
686 |
+
path = audio.get("path")
|
687 |
+
word = ex.get("word", "").strip()
|
688 |
+
|
689 |
+
if path and word and len(word) > 2 and word not in seen_words:
|
690 |
+
rows.append({"audio_path": path, "text": word})
|
691 |
+
texts.append(str(word))
|
692 |
+
seen_words.add(word)
|
693 |
+
count += 1
|
694 |
+
|
695 |
+
if rows:
|
696 |
+
jsonl_path = dataset_dir / "data.jsonl"
|
697 |
+
_write_jsonl(rows, jsonl_path)
|
698 |
+
|
699 |
+
# Build markdown content updates for on-screen prompts
|
700 |
+
combined_updates = []
|
701 |
+
for i in range(MAX_COMPONENTS):
|
702 |
+
t = texts[i] if i < len(texts) else ""
|
703 |
+
if i < len(texts):
|
704 |
+
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
705 |
+
else:
|
706 |
+
combined_updates.append(gr.update(visible=False))
|
707 |
+
|
708 |
+
return (str(jsonl_path), texts, *combined_updates)
|
709 |
+
|
710 |
+
except Exception as e:
|
711 |
+
print(f"ML Commons Speech sample loading failed: {e}")
|
712 |
|
713 |
+
# Try Multilingual LibriSpeech as backup
|
714 |
try:
|
715 |
+
ds = load_dataset("facebook/multilingual_librispeech", f"{lang_code}", split=split, streaming=True)
|
|
|
716 |
ds = ds.cast_column("audio", Audio(sampling_rate=16000))
|
717 |
|
718 |
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
|
|
726 |
|
727 |
audio = ex.get("audio") or {}
|
728 |
path = audio.get("path")
|
729 |
+
text = ex.get("text", "").strip()
|
730 |
|
731 |
if path and text and len(text) > 10:
|
732 |
rows.append({"audio_path": path, "text": text})
|
|
|
739 |
|
740 |
# Build markdown content updates for on-screen prompts
|
741 |
combined_updates = []
|
742 |
+
for i in range(MAX_COMPONENTS):
|
743 |
t = texts[i] if i < len(texts) else ""
|
744 |
if i < len(texts):
|
745 |
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
|
|
749 |
return (str(jsonl_path), texts, *combined_updates)
|
750 |
|
751 |
except Exception as e:
|
752 |
+
print(f"Multilingual LibriSpeech failed: {e}")
|
753 |
|
754 |
# Fallback: generate synthetic samples with text only
|
755 |
print("Using fallback: generating text-only samples")
|
|
|
763 |
|
764 |
# Build markdown content updates for on-screen prompts
|
765 |
combined_updates = []
|
766 |
+
for i in range(MAX_COMPONENTS):
|
767 |
t = texts[i] if i < len(texts) else ""
|
768 |
if i < len(texts):
|
769 |
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|