Spaces:
Running
Running
Joseph Pollack
committed on
adds granary dataset for european languages
Browse files- interface.py +181 -206
interface.py
CHANGED
@@ -252,159 +252,115 @@ def start_voxtral_training(
|
|
252 |
|
253 |
|
254 |
def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
255 |
-
"""Load phrases from
|
256 |
|
257 |
-
Uses
|
258 |
-
|
259 |
-
2. Multilingual LibriSpeech (modern format)
|
260 |
-
3. Fallback to basic phrases
|
261 |
|
262 |
Args:
|
263 |
language: Language code (e.g., 'en', 'de', 'fr', etc.)
|
264 |
-
max_phrases: Maximum number of phrases to load (None for
|
265 |
split: Dataset split to use ('train', 'validation', 'test')
|
266 |
|
267 |
Returns:
|
268 |
-
List of
|
269 |
"""
|
270 |
from datasets import load_dataset
|
271 |
import random
|
272 |
|
273 |
-
#
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
"
|
281 |
-
"pl":
|
282 |
-
"
|
283 |
-
"
|
284 |
-
"
|
285 |
-
"
|
286 |
-
"ja": {"ml_speech": "ja", "librispeech": None},
|
287 |
-
"ko": {"ml_speech": "ko", "librispeech": None},
|
288 |
}
|
289 |
|
290 |
-
|
|
|
291 |
|
292 |
-
# Try ML Commons Speech first (modern format)
|
293 |
try:
|
294 |
-
print(f"
|
295 |
-
|
296 |
-
|
|
|
|
|
297 |
|
298 |
phrases = []
|
299 |
count = 0
|
300 |
-
|
301 |
|
|
|
302 |
for example in ds:
|
303 |
-
if
|
304 |
break
|
305 |
-
word = example.get("word", "").strip()
|
306 |
-
if word and len(word) > 2 and word not in seen_words: # Filter duplicates and short words
|
307 |
-
phrases.append(word)
|
308 |
-
seen_words.add(word)
|
309 |
-
count += 1
|
310 |
|
311 |
-
|
312 |
-
|
313 |
-
random.shuffle(phrases)
|
314 |
-
return phrases
|
315 |
|
316 |
-
|
317 |
-
|
|
|
|
|
|
|
|
|
|
|
318 |
|
319 |
-
# Try Multilingual LibriSpeech as backup
|
320 |
-
try:
|
321 |
-
if lang_config["librispeech"]:
|
322 |
-
print(f"Trying Multilingual LibriSpeech dataset for language: {language}")
|
323 |
-
librispeech_lang = lang_config["librispeech"]
|
324 |
-
ds = load_dataset("facebook/multilingual_librispeech", f"{language}", split=split, streaming=True)
|
325 |
-
|
326 |
-
phrases = []
|
327 |
-
count = 0
|
328 |
-
for example in ds:
|
329 |
-
if max_phrases and count >= max_phrases:
|
330 |
-
break
|
331 |
-
text = example.get("text", "").strip()
|
332 |
-
if text and len(text) > 10: # Filter out very short phrases
|
333 |
-
phrases.append(text)
|
334 |
-
count += 1
|
335 |
-
|
336 |
-
if phrases:
|
337 |
-
print(f"Successfully loaded {len(phrases)} phrases from Multilingual LibriSpeech")
|
338 |
-
random.shuffle(phrases)
|
339 |
-
return phrases
|
340 |
-
|
341 |
-
except Exception as e:
|
342 |
-
print(f"Multilingual LibriSpeech failed: {e}")
|
343 |
-
|
344 |
-
# Try TED Talk translations (works for many languages)
|
345 |
-
try:
|
346 |
-
print(f"Trying TED Talk translations for language: {language}")
|
347 |
-
ds = load_dataset("ted_talks_iwslt", language=[f"{language}_en"], split=split, streaming=True)
|
348 |
-
|
349 |
-
phrases = []
|
350 |
-
count = 0
|
351 |
-
for example in ds:
|
352 |
-
if max_phrases and count >= max_phrases:
|
353 |
-
break
|
354 |
-
text = example.get("translation", {}).get(language, "").strip()
|
355 |
-
if text and len(text) > 10: # Filter out very short phrases
|
356 |
phrases.append(text)
|
|
|
357 |
count += 1
|
358 |
|
359 |
if phrases:
|
360 |
-
|
361 |
random.shuffle(phrases)
|
|
|
362 |
return phrases
|
363 |
|
|
|
|
|
|
|
|
|
364 |
except Exception as e:
|
365 |
-
print(f"
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
"I forgot my password again.",
|
392 |
-
"Please send me the invoice.",
|
393 |
-
"The project is almost complete.",
|
394 |
-
"I appreciate your hard work.",
|
395 |
-
"Let's schedule a meeting next week.",
|
396 |
-
"The food tastes delicious.",
|
397 |
-
"I need to buy some groceries.",
|
398 |
-
"Please turn off the lights.",
|
399 |
-
"The presentation went very well.",
|
400 |
-
]
|
401 |
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
|
407 |
-
|
408 |
|
409 |
# Initialize phrases dynamically
|
410 |
DEFAULT_LANGUAGE = "en" # Default to English
|
@@ -447,15 +403,43 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
447 |
|
448 |
jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
|
449 |
|
450 |
-
# Language selection for
|
451 |
language_selector = gr.Dropdown(
|
452 |
choices=[
|
453 |
-
"
|
454 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
455 |
],
|
456 |
value="en",
|
457 |
label="Language for Speech Phrases",
|
458 |
-
info="Select language for phrases from
|
459 |
)
|
460 |
|
461 |
# Recording grid with dynamic text readouts
|
@@ -491,15 +475,20 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
491 |
"""Add 10 more rows by making them visible"""
|
492 |
new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))
|
493 |
|
494 |
-
# Create updates for all MAX_COMPONENTS
|
495 |
-
|
|
|
|
|
496 |
for i in range(MAX_COMPONENTS):
|
497 |
if i < len(current_phrases) and i < new_visible:
|
498 |
-
|
|
|
499 |
else:
|
500 |
-
|
|
|
501 |
|
502 |
-
|
|
|
503 |
|
504 |
def change_language(language):
|
505 |
"""Change the language and reload phrases from multilingual datasets"""
|
@@ -507,17 +496,23 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
507 |
# Reset visible rows to 10
|
508 |
visible_count = min(10, len(new_phrases), MAX_COMPONENTS)
|
509 |
|
510 |
-
# Create updates for
|
511 |
-
|
|
|
|
|
512 |
for i in range(MAX_COMPONENTS):
|
513 |
if i < len(new_phrases) and i < visible_count:
|
514 |
-
|
|
|
515 |
elif i < len(new_phrases):
|
516 |
-
|
|
|
517 |
else:
|
518 |
-
|
|
|
519 |
|
520 |
-
|
|
|
521 |
|
522 |
# Connect language change to phrase reloading
|
523 |
language_selector.change(
|
@@ -647,112 +642,87 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
647 |
|
648 |
# Quick sample from multilingual datasets (Common Voice, etc.)
|
649 |
with gr.Row():
|
650 |
-
vp_lang = gr.Dropdown(choices=["en", "de", "fr", "es", "it", "pl", "pt", "nl", "ru", "ar", "zh", "ja", "ko"], value="en", label="Sample Language")
|
651 |
vp_samples = gr.Number(value=20, precision=0, label="Num samples")
|
652 |
vp_split = gr.Dropdown(choices=["train", "validation", "test"], value="train", label="Split")
|
653 |
vp_btn = gr.Button("Use Multilingual Dataset Sample")
|
654 |
|
655 |
def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
|
656 |
-
"""
|
657 |
from datasets import load_dataset, Audio
|
658 |
import random
|
659 |
|
660 |
-
#
|
661 |
-
|
662 |
"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
|
663 |
"pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
|
664 |
-
"zh": "zh", "ja": "ja", "ko": "ko"
|
|
|
|
|
|
|
665 |
}
|
666 |
|
667 |
-
|
668 |
|
669 |
try:
|
670 |
-
|
671 |
-
ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
|
672 |
-
ds = ds.cast_column("audio", Audio(sampling_rate=16000))
|
673 |
|
674 |
-
|
675 |
-
|
676 |
-
texts: list[str] = []
|
677 |
|
|
|
|
|
|
|
678 |
count = 0
|
679 |
-
seen_words = set()
|
680 |
|
681 |
-
|
|
|
682 |
if count >= num_samples:
|
683 |
break
|
684 |
|
685 |
-
|
686 |
-
|
687 |
-
word = ex.get("word", "").strip()
|
688 |
-
|
689 |
-
if path and word and len(word) > 2 and word not in seen_words:
|
690 |
-
rows.append({"audio_path": path, "text": word})
|
691 |
-
texts.append(str(word))
|
692 |
-
seen_words.add(word)
|
693 |
-
count += 1
|
694 |
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
combined_updates = []
|
701 |
-
for i in range(MAX_COMPONENTS):
|
702 |
-
t = texts[i] if i < len(texts) else ""
|
703 |
-
if i < len(texts):
|
704 |
-
combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
705 |
-
else:
|
706 |
-
combined_updates.append(gr.update(visible=False))
|
707 |
-
|
708 |
-
return (str(jsonl_path), texts, *combined_updates)
|
709 |
-
|
710 |
-
except Exception as e:
|
711 |
-
print(f"ML Commons Speech sample loading failed: {e}")
|
712 |
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
719 |
-
rows: list[dict] = []
|
720 |
-
texts: list[str] = []
|
721 |
-
|
722 |
-
count = 0
|
723 |
-
for ex in ds:
|
724 |
-
if count >= num_samples:
|
725 |
-
break
|
726 |
-
|
727 |
-
audio = ex.get("audio") or {}
|
728 |
-
path = audio.get("path")
|
729 |
-
text = ex.get("text", "").strip()
|
730 |
-
|
731 |
-
if path and text and len(text) > 10:
|
732 |
-
rows.append({"audio_path": path, "text": text})
|
733 |
-
texts.append(str(text))
|
734 |
count += 1
|
735 |
|
736 |
if rows:
|
737 |
jsonl_path = dataset_dir / "data.jsonl"
|
738 |
_write_jsonl(rows, jsonl_path)
|
739 |
|
740 |
-
|
741 |
-
|
|
|
|
|
|
|
742 |
for i in range(MAX_COMPONENTS):
|
743 |
t = texts[i] if i < len(texts) else ""
|
744 |
if i < len(texts):
|
745 |
-
|
|
|
746 |
else:
|
747 |
-
|
|
|
|
|
|
|
748 |
|
749 |
return (str(jsonl_path), texts, *combined_updates)
|
750 |
|
751 |
except Exception as e:
|
752 |
-
print(f"
|
753 |
|
754 |
-
# Fallback: generate
|
755 |
-
print("Using fallback: generating text-only samples")
|
756 |
phrases = load_multilingual_phrases(lang_code, max_phrases=num_samples)
|
757 |
texts = phrases[:num_samples]
|
758 |
|
@@ -761,14 +731,19 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
761 |
jsonl_path = dataset_dir / "data.jsonl"
|
762 |
_write_jsonl(rows, jsonl_path)
|
763 |
|
764 |
-
# Build markdown content updates for on-screen prompts
|
765 |
-
|
|
|
766 |
for i in range(MAX_COMPONENTS):
|
767 |
t = texts[i] if i < len(texts) else ""
|
768 |
if i < len(texts):
|
769 |
-
|
|
|
770 |
else:
|
771 |
-
|
|
|
|
|
|
|
772 |
|
773 |
return (str(jsonl_path), texts, *combined_updates)
|
774 |
|
|
|
252 |
|
253 |
|
254 |
def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
255 |
+
"""Load phrases from NVIDIA Granary dataset.
|
256 |
|
257 |
+
Uses the high-quality Granary dataset which contains speech recognition
|
258 |
+
and translation data for 25 European languages.
|
|
|
|
|
259 |
|
260 |
Args:
|
261 |
language: Language code (e.g., 'en', 'de', 'fr', etc.)
|
262 |
+
max_phrases: Maximum number of phrases to load (None for default 1000)
|
263 |
split: Dataset split to use ('train', 'validation', 'test')
|
264 |
|
265 |
Returns:
|
266 |
+
List of transcription phrases from Granary dataset
|
267 |
"""
|
268 |
from datasets import load_dataset
|
269 |
import random
|
270 |
|
271 |
+
# Default to 1000 phrases if not specified
|
272 |
+
if max_phrases is None:
|
273 |
+
max_phrases = 1000
|
274 |
+
|
275 |
+
# Language code mapping for Granary dataset
|
276 |
+
# Granary supports these language codes directly
|
277 |
+
granary_supported_langs = {
|
278 |
+
"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
|
279 |
+
"pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
|
280 |
+
"zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
|
281 |
+
"no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
|
282 |
+
"sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
|
283 |
+
"uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
|
|
|
|
|
284 |
}
|
285 |
|
286 |
+
# Map input language to Granary configuration
|
287 |
+
granary_lang = granary_supported_langs.get(language, "en") # Default to English
|
288 |
|
|
|
289 |
try:
|
290 |
+
print(f"Loading phrases from NVIDIA Granary dataset for language: {language}")
|
291 |
+
|
292 |
+
# Load Granary dataset with ASR (speech recognition) split
|
293 |
+
# Use streaming to handle large datasets efficiently
|
294 |
+
ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)
|
295 |
|
296 |
phrases = []
|
297 |
count = 0
|
298 |
+
seen_phrases = set()
|
299 |
|
300 |
+
# Sample phrases from the dataset
|
301 |
for example in ds:
|
302 |
+
if count >= max_phrases:
|
303 |
break
|
|
|
|
|
|
|
|
|
|
|
304 |
|
305 |
+
# Extract the text transcription
|
306 |
+
text = example.get("text", "").strip()
|
|
|
|
|
307 |
|
308 |
+
# Filter for quality phrases
|
309 |
+
if (text and
|
310 |
+
len(text) > 10 and # Minimum length
|
311 |
+
len(text) < 200 and # Maximum length to avoid very long utterances
|
312 |
+
text not in seen_phrases and # Avoid duplicates
|
313 |
+
not text.isdigit() and # Avoid pure numbers
|
314 |
+
not all(c in "0123456789., " for c in text)): # Avoid mostly numeric
|
315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
phrases.append(text)
|
317 |
+
seen_phrases.add(text)
|
318 |
count += 1
|
319 |
|
320 |
if phrases:
|
321 |
+
# Shuffle the phrases for variety
|
322 |
random.shuffle(phrases)
|
323 |
+
print(f"Successfully loaded {len(phrases)} phrases from Granary dataset for {language}")
|
324 |
return phrases
|
325 |
|
326 |
+
else:
|
327 |
+
print(f"No suitable phrases found in Granary dataset for {language}")
|
328 |
+
raise Exception("No phrases found")
|
329 |
+
|
330 |
except Exception as e:
|
331 |
+
print(f"Granary dataset loading failed for {language}: {e}")
|
332 |
+
|
333 |
+
# Fallback to basic phrases if Granary fails
|
334 |
+
print("Using fallback phrases")
|
335 |
+
fallback_phrases = [
|
336 |
+
"The quick brown fox jumps over the lazy dog.",
|
337 |
+
"Please say your full name.",
|
338 |
+
"Today is a good day to learn something new.",
|
339 |
+
"Artificial intelligence helps with many tasks.",
|
340 |
+
"I enjoy reading books and listening to music.",
|
341 |
+
"This is a sample sentence for testing speech.",
|
342 |
+
"Speak clearly and at a normal pace.",
|
343 |
+
"Numbers like one, two, three are easy to say.",
|
344 |
+
"The weather is sunny with a chance of rain.",
|
345 |
+
"Thank you for taking the time to help.",
|
346 |
+
"Hello, how are you today?",
|
347 |
+
"I would like to order a pizza.",
|
348 |
+
"The meeting is scheduled for tomorrow.",
|
349 |
+
"Please call me back as soon as possible.",
|
350 |
+
"Thank you for your assistance.",
|
351 |
+
"Can you help me with this problem?",
|
352 |
+
"I need to make a reservation.",
|
353 |
+
"The weather looks beautiful outside.",
|
354 |
+
"Let's go for a walk in the park.",
|
355 |
+
"I enjoy listening to classical music.",
|
356 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
|
358 |
+
if max_phrases:
|
359 |
+
fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
|
360 |
+
else:
|
361 |
+
random.shuffle(fallback_phrases)
|
362 |
|
363 |
+
return fallback_phrases
|
364 |
|
365 |
# Initialize phrases dynamically
|
366 |
DEFAULT_LANGUAGE = "en" # Default to English
|
|
|
403 |
|
404 |
jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
|
405 |
|
406 |
+
# Language selection for NVIDIA Granary phrases
|
407 |
language_selector = gr.Dropdown(
|
408 |
choices=[
|
409 |
+
("English", "en"),
|
410 |
+
("German", "de"),
|
411 |
+
("French", "fr"),
|
412 |
+
("Spanish", "es"),
|
413 |
+
("Italian", "it"),
|
414 |
+
("Portuguese", "pt"),
|
415 |
+
("Polish", "pl"),
|
416 |
+
("Dutch", "nl"),
|
417 |
+
("Russian", "ru"),
|
418 |
+
("Arabic", "ar"),
|
419 |
+
("Chinese", "zh"),
|
420 |
+
("Japanese", "ja"),
|
421 |
+
("Korean", "ko"),
|
422 |
+
("Danish", "da"),
|
423 |
+
("Swedish", "sv"),
|
424 |
+
("Norwegian", "no"),
|
425 |
+
("Finnish", "fi"),
|
426 |
+
("Estonian", "et"),
|
427 |
+
("Latvian", "lv"),
|
428 |
+
("Lithuanian", "lt"),
|
429 |
+
("Slovenian", "sl"),
|
430 |
+
("Slovak", "sk"),
|
431 |
+
("Czech", "cs"),
|
432 |
+
("Croatian", "hr"),
|
433 |
+
("Bulgarian", "bg"),
|
434 |
+
("Ukrainian", "uk"),
|
435 |
+
("Romanian", "ro"),
|
436 |
+
("Hungarian", "hu"),
|
437 |
+
("Greek", "el"),
|
438 |
+
("Maltese", "mt")
|
439 |
],
|
440 |
value="en",
|
441 |
label="Language for Speech Phrases",
|
442 |
+
info="Select language for authentic phrases from NVIDIA Granary dataset (25 European languages)"
|
443 |
)
|
444 |
|
445 |
# Recording grid with dynamic text readouts
|
|
|
475 |
"""Add 10 more rows by making them visible"""
|
476 |
new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))
|
477 |
|
478 |
+
# Create updates for all MAX_COMPONENTS (both markdown and audio components)
|
479 |
+
markdown_updates = []
|
480 |
+
audio_updates = []
|
481 |
+
|
482 |
for i in range(MAX_COMPONENTS):
|
483 |
if i < len(current_phrases) and i < new_visible:
|
484 |
+
markdown_updates.append(gr.update(visible=True))
|
485 |
+
audio_updates.append(gr.update(visible=True))
|
486 |
else:
|
487 |
+
markdown_updates.append(gr.update(visible=False))
|
488 |
+
audio_updates.append(gr.update(visible=False))
|
489 |
|
490 |
+
# Return: [state] + markdown_updates + audio_updates
|
491 |
+
return [new_visible] + markdown_updates + audio_updates
|
492 |
|
493 |
def change_language(language):
|
494 |
"""Change the language and reload phrases from multilingual datasets"""
|
|
|
496 |
# Reset visible rows to 10
|
497 |
visible_count = min(10, len(new_phrases), MAX_COMPONENTS)
|
498 |
|
499 |
+
# Create separate updates for markdown and audio components
|
500 |
+
markdown_updates = []
|
501 |
+
audio_updates = []
|
502 |
+
|
503 |
for i in range(MAX_COMPONENTS):
|
504 |
if i < len(new_phrases) and i < visible_count:
|
505 |
+
markdown_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
|
506 |
+
audio_updates.append(gr.update(visible=True))
|
507 |
elif i < len(new_phrases):
|
508 |
+
markdown_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=False))
|
509 |
+
audio_updates.append(gr.update(visible=False))
|
510 |
else:
|
511 |
+
markdown_updates.append(gr.update(value=f"**{i+1}. **", visible=False))
|
512 |
+
audio_updates.append(gr.update(visible=False))
|
513 |
|
514 |
+
# Return: [phrases_state, visible_state] + markdown_updates + audio_updates
|
515 |
+
return [new_phrases, visible_count] + markdown_updates + audio_updates
|
516 |
|
517 |
# Connect language change to phrase reloading
|
518 |
language_selector.change(
|
|
|
642 |
|
643 |
# Quick sample from the NVIDIA Granary multilingual dataset
|
644 |
with gr.Row():
|
645 |
+
vp_lang = gr.Dropdown(choices=["en", "de", "fr", "es", "it", "pl", "pt", "nl", "ru", "ar", "zh", "ja", "ko", "da", "sv", "fi", "et", "cs", "hr", "bg", "uk", "ro", "hu", "el"], value="en", label="Sample Language")
|
646 |
vp_samples = gr.Number(value=20, precision=0, label="Num samples")
|
647 |
vp_split = gr.Dropdown(choices=["train", "validation", "test"], value="train", label="Split")
|
648 |
vp_btn = gr.Button("Use Multilingual Dataset Sample")
|
649 |
|
650 |
def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
|
651 |
+
"""Collect sample audio and text from NVIDIA Granary dataset"""
|
652 |
from datasets import load_dataset, Audio
|
653 |
import random
|
654 |
|
655 |
+
# Map language code to Granary format
|
656 |
+
granary_lang_map = {
|
657 |
"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
|
658 |
"pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
|
659 |
+
"zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
|
660 |
+
"no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
|
661 |
+
"sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
|
662 |
+
"uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
|
663 |
}
|
664 |
|
665 |
+
granary_lang = granary_lang_map.get(lang_code, "en")
|
666 |
|
667 |
try:
|
668 |
+
print(f"Collecting {num_samples} samples from NVIDIA Granary dataset for language: {lang_code}")
|
|
|
|
|
669 |
|
670 |
+
# Load Granary dataset with ASR split
|
671 |
+
ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)
|
|
|
672 |
|
673 |
+
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
674 |
+
rows = []
|
675 |
+
texts = []
|
676 |
count = 0
|
|
|
677 |
|
678 |
+
# Sample from the dataset
|
679 |
+
for example in ds:
|
680 |
if count >= num_samples:
|
681 |
break
|
682 |
|
683 |
+
text = example.get("text", "").strip()
|
684 |
+
audio_path = example.get("audio_filepath", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
685 |
|
686 |
+
# Filter for quality samples
|
687 |
+
if (text and
|
688 |
+
len(text) > 10 and
|
689 |
+
len(text) < 200 and
|
690 |
+
audio_path): # Must have audio file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
691 |
|
692 |
+
rows.append({
|
693 |
+
"audio_path": audio_path,
|
694 |
+
"text": text
|
695 |
+
})
|
696 |
+
texts.append(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
count += 1
|
698 |
|
699 |
if rows:
|
700 |
jsonl_path = dataset_dir / "data.jsonl"
|
701 |
_write_jsonl(rows, jsonl_path)
|
702 |
|
703 |
+
print(f"Successfully collected {len(rows)} samples from Granary dataset")
|
704 |
+
|
705 |
+
# Build markdown and audio content updates for on-screen prompts
|
706 |
+
markdown_updates = []
|
707 |
+
audio_updates = []
|
708 |
for i in range(MAX_COMPONENTS):
|
709 |
t = texts[i] if i < len(texts) else ""
|
710 |
if i < len(texts):
|
711 |
+
markdown_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
712 |
+
audio_updates.append(gr.update(visible=True))
|
713 |
else:
|
714 |
+
markdown_updates.append(gr.update(visible=False))
|
715 |
+
audio_updates.append(gr.update(visible=False))
|
716 |
+
|
717 |
+
combined_updates = markdown_updates + audio_updates
|
718 |
|
719 |
return (str(jsonl_path), texts, *combined_updates)
|
720 |
|
721 |
except Exception as e:
|
722 |
+
print(f"Granary sample collection failed for {lang_code}: {e}")
|
723 |
|
724 |
+
# Fallback: generate text-only samples if Granary fails
|
725 |
+
print(f"Using fallback: generating text-only samples for {lang_code}")
|
726 |
phrases = load_multilingual_phrases(lang_code, max_phrases=num_samples)
|
727 |
texts = phrases[:num_samples]
|
728 |
|
|
|
731 |
jsonl_path = dataset_dir / "data.jsonl"
|
732 |
_write_jsonl(rows, jsonl_path)
|
733 |
|
734 |
+
# Build markdown and audio content updates for on-screen prompts
|
735 |
+
markdown_updates = []
|
736 |
+
audio_updates = []
|
737 |
for i in range(MAX_COMPONENTS):
|
738 |
t = texts[i] if i < len(texts) else ""
|
739 |
if i < len(texts):
|
740 |
+
markdown_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
|
741 |
+
audio_updates.append(gr.update(visible=True))
|
742 |
else:
|
743 |
+
markdown_updates.append(gr.update(visible=False))
|
744 |
+
audio_updates.append(gr.update(visible=False))
|
745 |
+
|
746 |
+
combined_updates = markdown_updates + audio_updates
|
747 |
|
748 |
return (str(jsonl_path), texts, *combined_updates)
|
749 |
|