Spaces: Joseph Pollack committed
adds granary dataset for european languages

interface.py CHANGED (+181 -206)
@@ -252,159 +252,115 @@ def start_voxtral_training(

 def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
-    """Load phrases from …
-
-    Uses …
-    2. Multilingual LibriSpeech (modern format)
-    3. Fallback to basic phrases

     Args:
         language: Language code (e.g., 'en', 'de', 'fr', etc.)
-        max_phrases: Maximum number of phrases to load (None for …)
         split: Dataset split to use ('train', 'validation', 'test')

     Returns:
-        List of …
     """
     from datasets import load_dataset
     import random

-    # …
-        …
-        "pl": …
-        …
-        "ja": {"ml_speech": "ja", "librispeech": None},
-        "ko": {"ml_speech": "ko", "librispeech": None},
     }

-    # Try ML Commons Speech first (modern format)
     try:
-        print(f"…")
-        …

         phrases = []
         count = 0
         for example in ds:
-            if …:
                 break
-            word = example.get("word", "").strip()
-            if word and len(word) > 2 and word not in seen_words:  # Filter duplicates and short words
-                phrases.append(word)
-                seen_words.add(word)
-                count += 1

-        random.shuffle(phrases)
-        return phrases

-    # Try Multilingual LibriSpeech as backup
-    try:
-        if lang_config["librispeech"]:
-            print(f"Trying Multilingual LibriSpeech dataset for language: {language}")
-            librispeech_lang = lang_config["librispeech"]
-            ds = load_dataset("facebook/multilingual_librispeech", f"{language}", split=split, streaming=True)
-
-            phrases = []
-            count = 0
-            for example in ds:
-                if max_phrases and count >= max_phrases:
-                    break
-                text = example.get("text", "").strip()
-                if text and len(text) > 10:  # Filter out very short phrases
-                    phrases.append(text)
-                    count += 1
-
-            if phrases:
-                print(f"Successfully loaded {len(phrases)} phrases from Multilingual LibriSpeech")
-                random.shuffle(phrases)
-                return phrases
-
-    except Exception as e:
-        print(f"Multilingual LibriSpeech failed: {e}")
-
-    # Try TED Talk translations (works for many languages)
-    try:
-        print(f"Trying TED Talk translations for language: {language}")
-        ds = load_dataset("ted_talks_iwslt", language=[f"{language}_en"], split=split, streaming=True)
-
-        phrases = []
-        count = 0
-        for example in ds:
-            if max_phrases and count >= max_phrases:
-                break
-            text = example.get("translation", {}).get(language, "").strip()
-            if text and len(text) > 10:  # Filter out very short phrases
                 phrases.append(text)
                 count += 1

         if phrases:
-            …
             random.shuffle(phrases)
             return phrases

     except Exception as e:
-        print(f"…")
-        …
-            "I forgot my password again.",
-            "Please send me the invoice.",
-            "The project is almost complete.",
-            "I appreciate your hard work.",
-            "Let's schedule a meeting next week.",
-            "The food tastes delicious.",
-            "I need to buy some groceries.",
-            "Please turn off the lights.",
-            "The presentation went very well.",
-        ]
-        …

 # Initialize phrases dynamically
 DEFAULT_LANGUAGE = "en"  # Default to English
@@ -447,15 +403,43 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

     jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)

-    # Language selection for …
     language_selector = gr.Dropdown(
         choices=[
-            "…",
-            "…",
-            …
         ],
         value="en",
         label="Language for Speech Phrases",
-        info="Select language for phrases from …"
     )

     # Recording grid with dynamic text readouts
@@ -491,15 +475,20 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

         """Add 10 more rows by making them visible"""
         new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))

-        # Create updates for all MAX_COMPONENTS
-        …
         for i in range(MAX_COMPONENTS):
             if i < len(current_phrases) and i < new_visible:
-                …
             else:
-                …

-        …

     def change_language(language):
         """Change the language and reload phrases from multilingual datasets"""
@@ -507,17 +496,23 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

         # Reset visible rows to 10
         visible_count = min(10, len(new_phrases), MAX_COMPONENTS)

-        # Create updates for …
-        …
         for i in range(MAX_COMPONENTS):
             if i < len(new_phrases) and i < visible_count:
-                …
             elif i < len(new_phrases):
-                …
             else:
-                …

-        …

     # Connect language change to phrase reloading
     language_selector.change(
@@ -647,112 +642,87 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

     # Quick sample from multilingual datasets (Common Voice, etc.)
     with gr.Row():
-        vp_lang = gr.Dropdown(choices=["en", "de", "fr", "es", "it", "pl", "pt", "nl", "ru", "ar", "zh", "ja", "ko"], value="en", label="Sample Language")
         vp_samples = gr.Number(value=20, precision=0, label="Num samples")
         vp_split = gr.Dropdown(choices=["train", "validation", "test"], value="train", label="Split")
         vp_btn = gr.Button("Use Multilingual Dataset Sample")

     def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
-        """…"""
         from datasets import load_dataset, Audio
         import random

-        # …
-        … = {
             "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
             "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
-            "zh": "zh", "ja": "ja", "ko": "ko"
         }

-        …

         try:
-            …
-            ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
-            ds = ds.cast_column("audio", Audio(sampling_rate=16000))

-            …
-            texts: list[str] = []
             count = 0
-            seen_words = set()

-            …
                 if count >= num_samples:
                     break

-                …
-                word = ex.get("word", "").strip()
-                …
-                if path and word and len(word) > 2 and word not in seen_words:
-                    rows.append({"audio_path": path, "text": word})
-                    texts.append(str(word))
-                    seen_words.add(word)
-                    count += 1

-            …
-            combined_updates = []
-            for i in range(MAX_COMPONENTS):
-                t = texts[i] if i < len(texts) else ""
-                if i < len(texts):
-                    combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
-                else:
-                    combined_updates.append(gr.update(visible=False))
-
-            return (str(jsonl_path), texts, *combined_updates)
-
-        except Exception as e:
-            print(f"ML Commons Speech sample loading failed: {e}")

-            …
-            dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
-            rows: list[dict] = []
-            texts: list[str] = []
-
-            count = 0
-            for ex in ds:
-                if count >= num_samples:
-                    break
-
-                audio = ex.get("audio") or {}
-                path = audio.get("path")
-                text = ex.get("text", "").strip()
-
-                if path and text and len(text) > 10:
-                    rows.append({"audio_path": path, "text": text})
-                    texts.append(str(text))
                     count += 1

         if rows:
             jsonl_path = dataset_dir / "data.jsonl"
             _write_jsonl(rows, jsonl_path)

-            …
             for i in range(MAX_COMPONENTS):
                 t = texts[i] if i < len(texts) else ""
                 if i < len(texts):
-                    …
                 else:
-                    …

             return (str(jsonl_path), texts, *combined_updates)

         except Exception as e:
-            print(f"…")

-        # Fallback: generate …
-        print("Using fallback: generating text-only samples")
         phrases = load_multilingual_phrases(lang_code, max_phrases=num_samples)
         texts = phrases[:num_samples]
@@ -761,14 +731,19 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

         jsonl_path = dataset_dir / "data.jsonl"
         _write_jsonl(rows, jsonl_path)

-        # Build markdown content updates for on-screen prompts
-        …
         for i in range(MAX_COMPONENTS):
             t = texts[i] if i < len(texts) else ""
             if i < len(texts):
-                …
             else:
-                …

         return (str(jsonl_path), texts, *combined_updates)
 def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
+    """Load phrases from NVIDIA Granary dataset.
+
+    Uses the high-quality Granary dataset which contains speech recognition
+    and translation data for 25 European languages.

     Args:
         language: Language code (e.g., 'en', 'de', 'fr', etc.)
+        max_phrases: Maximum number of phrases to load (None for default 1000)
         split: Dataset split to use ('train', 'validation', 'test')

     Returns:
+        List of transcription phrases from Granary dataset
     """
     from datasets import load_dataset
     import random

+    # Default to 1000 phrases if not specified
+    if max_phrases is None:
+        max_phrases = 1000
+
+    # Language code mapping for Granary dataset
+    # Granary supports these language codes directly
+    granary_supported_langs = {
+        "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
+        "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
+        "zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
+        "no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
+        "sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
+        "uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
     }

+    # Map input language to Granary configuration
+    granary_lang = granary_supported_langs.get(language, "en")  # Default to English

     try:
+        print(f"Loading phrases from NVIDIA Granary dataset for language: {language}")
+
+        # Load Granary dataset with ASR (speech recognition) split
+        # Use streaming to handle large datasets efficiently
+        ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)

         phrases = []
         count = 0
+        seen_phrases = set()

+        # Sample phrases from the dataset
         for example in ds:
+            if count >= max_phrases:
                 break

+            # Extract the text transcription
+            text = example.get("text", "").strip()

+            # Filter for quality phrases
+            if (text and
+                len(text) > 10 and  # Minimum length
+                len(text) < 200 and  # Maximum length to avoid very long utterances
+                text not in seen_phrases and  # Avoid duplicates
+                not text.isdigit() and  # Avoid pure numbers
+                not all(c in "0123456789., " for c in text)):  # Avoid mostly numeric

                 phrases.append(text)
+                seen_phrases.add(text)
                 count += 1

         if phrases:
+            # Shuffle the phrases for variety
             random.shuffle(phrases)
+            print(f"Successfully loaded {len(phrases)} phrases from Granary dataset for {language}")
             return phrases
+        else:
+            print(f"No suitable phrases found in Granary dataset for {language}")
+            raise Exception("No phrases found")
+
     except Exception as e:
+        print(f"Granary dataset loading failed for {language}: {e}")
+
+        # Fallback to basic phrases if Granary fails
+        print("Using fallback phrases")
+        fallback_phrases = [
+            "The quick brown fox jumps over the lazy dog.",
+            "Please say your full name.",
+            "Today is a good day to learn something new.",
+            "Artificial intelligence helps with many tasks.",
+            "I enjoy reading books and listening to music.",
+            "This is a sample sentence for testing speech.",
+            "Speak clearly and at a normal pace.",
+            "Numbers like one, two, three are easy to say.",
+            "The weather is sunny with a chance of rain.",
+            "Thank you for taking the time to help.",
+            "Hello, how are you today?",
+            "I would like to order a pizza.",
+            "The meeting is scheduled for tomorrow.",
+            "Please call me back as soon as possible.",
+            "Thank you for your assistance.",
+            "Can you help me with this problem?",
+            "I need to make a reservation.",
+            "The weather looks beautiful outside.",
+            "Let's go for a walk in the park.",
+            "I enjoy listening to classical music.",
+        ]

+        if max_phrases:
+            fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
+        else:
+            random.shuffle(fallback_phrases)

+        return fallback_phrases

 # Initialize phrases dynamically
 DEFAULT_LANGUAGE = "en"  # Default to English
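A quick way to sanity-check the new loader path outside the app is to stream a few Granary examples directly. This is a minimal sketch assuming the same "nvidia/Granary" config names and "asr" split used above, and that streamed examples expose a "text" field:

from datasets import load_dataset

# Stream five German ASR examples from Granary (config/split names mirror the
# loader above; adjust if the dataset layout differs).
ds = load_dataset("nvidia/Granary", "de", split="asr", streaming=True)
for i, example in enumerate(ds):
    if i >= 5:
        break
    print(example.get("text", ""))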
     jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)

+    # Language selection for NVIDIA Granary phrases
     language_selector = gr.Dropdown(
         choices=[
+            ("English", "en"),
+            ("German", "de"),
+            ("French", "fr"),
+            ("Spanish", "es"),
+            ("Italian", "it"),
+            ("Portuguese", "pt"),
+            ("Polish", "pl"),
+            ("Dutch", "nl"),
+            ("Russian", "ru"),
+            ("Arabic", "ar"),
+            ("Chinese", "zh"),
+            ("Japanese", "ja"),
+            ("Korean", "ko"),
+            ("Danish", "da"),
+            ("Swedish", "sv"),
+            ("Norwegian", "no"),
+            ("Finnish", "fi"),
+            ("Estonian", "et"),
+            ("Latvian", "lv"),
+            ("Lithuanian", "lt"),
+            ("Slovenian", "sl"),
+            ("Slovak", "sk"),
+            ("Czech", "cs"),
+            ("Croatian", "hr"),
+            ("Bulgarian", "bg"),
+            ("Ukrainian", "uk"),
+            ("Romanian", "ro"),
+            ("Hungarian", "hu"),
+            ("Greek", "el"),
+            ("Maltese", "mt")
         ],
         value="en",
         label="Language for Speech Phrases",
+        info="Select language for authentic phrases from NVIDIA Granary dataset (25 European languages)"
     )

     # Recording grid with dynamic text readouts
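Gradio dropdowns accept (label, value) tuples, so the UI shows the full language name while the callback receives the short code. A minimal sketch of that behavior:

import gradio as gr

with gr.Blocks() as sketch:
    lang = gr.Dropdown(choices=[("English", "en"), ("German", "de")], value="en")
    out = gr.Textbox()
    # The handler receives the underlying value ("en" or "de"), not the label.
    lang.change(lambda code: f"selected code: {code}", inputs=lang, outputs=out)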
         """Add 10 more rows by making them visible"""
         new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))

+        # Create updates for all MAX_COMPONENTS (both markdown and audio components)
+        markdown_updates = []
+        audio_updates = []
+
         for i in range(MAX_COMPONENTS):
             if i < len(current_phrases) and i < new_visible:
+                markdown_updates.append(gr.update(visible=True))
+                audio_updates.append(gr.update(visible=True))
             else:
+                markdown_updates.append(gr.update(visible=False))
+                audio_updates.append(gr.update(visible=False))

+        # Return: [state] + markdown_updates + audio_updates
+        return [new_visible] + markdown_updates + audio_updates

     def change_language(language):
         """Change the language and reload phrases from multilingual datasets"""
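The recording grid pre-creates MAX_COMPONENTS rows and only toggles their visibility, since a Gradio component graph is fixed once the app launches. A minimal sketch of the same pattern, with a hypothetical MAX_COMPONENTS value standing in for whatever interface.py defines:

import gradio as gr

MAX_COMPONENTS = 40  # hypothetical cap; interface.py defines its own value

with gr.Blocks() as sketch:
    rows = [gr.Markdown(f"row {i+1}", visible=(i < 10)) for i in range(MAX_COMPONENTS)]
    visible_state = gr.State(10)
    more = gr.Button("Add 10 more rows")

    def add_rows(current_visible):
        new_visible = min(current_visible + 10, MAX_COMPONENTS)
        return [new_visible] + [gr.update(visible=(i < new_visible)) for i in range(MAX_COMPONENTS)]

    # Outputs must be listed in the same order the handler returns them.
    more.click(add_rows, inputs=visible_state, outputs=[visible_state] + rows)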
         # Reset visible rows to 10
         visible_count = min(10, len(new_phrases), MAX_COMPONENTS)

+        # Create separate updates for markdown and audio components
+        markdown_updates = []
+        audio_updates = []
+
         for i in range(MAX_COMPONENTS):
             if i < len(new_phrases) and i < visible_count:
+                markdown_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
+                audio_updates.append(gr.update(visible=True))
             elif i < len(new_phrases):
+                markdown_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=False))
+                audio_updates.append(gr.update(visible=False))
             else:
+                markdown_updates.append(gr.update(value=f"**{i+1}. **", visible=False))
+                audio_updates.append(gr.update(visible=False))

+        # Return: [phrases_state, visible_state] + markdown_updates + audio_updates
+        return [new_phrases, visible_count] + markdown_updates + audio_updates

     # Connect language change to phrase reloading
     language_selector.change(
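The hunk ends at the opening of the language_selector.change(...) call. Because change_language returns one flat list, the outputs wiring must enumerate targets in exactly that order: the two states, then every markdown row, then every audio row. A sketch of the expected wiring, with hypothetical names phrases_state, visible_state, phrase_markdowns, and audio_inputs standing in for the app's real variables:

# Hypothetical component names for illustration; interface.py uses its own.
language_selector.change(
    change_language,
    inputs=[language_selector],
    outputs=[phrases_state, visible_state] + phrase_markdowns + audio_inputs,
)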
     # Quick sample from multilingual datasets (Common Voice, etc.)
     with gr.Row():
+        vp_lang = gr.Dropdown(choices=["en", "de", "fr", "es", "it", "pl", "pt", "nl", "ru", "ar", "zh", "ja", "ko", "da", "sv", "fi", "et", "cs", "hr", "bg", "uk", "ro", "hu", "el"], value="en", label="Sample Language")
         vp_samples = gr.Number(value=20, precision=0, label="Num samples")
         vp_split = gr.Dropdown(choices=["train", "validation", "test"], value="train", label="Split")
         vp_btn = gr.Button("Use Multilingual Dataset Sample")

     def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
+        """Collect sample audio and text from NVIDIA Granary dataset"""
         from datasets import load_dataset, Audio
         import random

+        # Map language code to Granary format
+        granary_lang_map = {
             "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
             "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
+            "zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
+            "no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
+            "sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
+            "uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
         }

+        granary_lang = granary_lang_map.get(lang_code, "en")

         try:
+            print(f"Collecting {num_samples} samples from NVIDIA Granary dataset for language: {lang_code}")

+            # Load Granary dataset with ASR split
+            ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)

+            dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
+            rows = []
+            texts = []
             count = 0

+            # Sample from the dataset
+            for example in ds:
                 if count >= num_samples:
                     break

+                text = example.get("text", "").strip()
+                audio_path = example.get("audio_filepath", "")

+                # Filter for quality samples
+                if (text and
+                    len(text) > 10 and
+                    len(text) < 200 and
+                    audio_path):  # Must have audio file

+                    rows.append({
+                        "audio_path": audio_path,
+                        "text": text
+                    })
+                    texts.append(text)
                     count += 1

             if rows:
                 jsonl_path = dataset_dir / "data.jsonl"
                 _write_jsonl(rows, jsonl_path)

+                print(f"Successfully collected {len(rows)} samples from Granary dataset")
+
+                # Build markdown and audio content updates for on-screen prompts
+                markdown_updates = []
+                audio_updates = []
                 for i in range(MAX_COMPONENTS):
                     t = texts[i] if i < len(texts) else ""
                     if i < len(texts):
+                        markdown_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
+                        audio_updates.append(gr.update(visible=True))
                     else:
+                        markdown_updates.append(gr.update(visible=False))
+                        audio_updates.append(gr.update(visible=False))
+
+                combined_updates = markdown_updates + audio_updates

                 return (str(jsonl_path), texts, *combined_updates)

         except Exception as e:
+            print(f"Granary sample collection failed for {lang_code}: {e}")

+        # Fallback: generate text-only samples if Granary fails
+        print(f"Using fallback: generating text-only samples for {lang_code}")
         phrases = load_multilingual_phrases(lang_code, max_phrases=num_samples)
         texts = phrases[:num_samples]

         jsonl_path = dataset_dir / "data.jsonl"
         _write_jsonl(rows, jsonl_path)

+        # Build markdown and audio content updates for on-screen prompts
+        markdown_updates = []
+        audio_updates = []
         for i in range(MAX_COMPONENTS):
             t = texts[i] if i < len(texts) else ""
             if i < len(texts):
+                markdown_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
+                audio_updates.append(gr.update(visible=True))
             else:
+                markdown_updates.append(gr.update(visible=False))
+                audio_updates.append(gr.update(visible=False))
+
+        combined_updates = markdown_updates + audio_updates

         return (str(jsonl_path), texts, *combined_updates)
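_write_jsonl is referenced throughout but not shown in this diff; the rows it receives are plain dicts with audio_path and text keys. A minimal sketch of a compatible writer, assuming one JSON object per line and UTF-8 output (the real helper in interface.py may differ):

import json
from pathlib import Path

def _write_jsonl(rows: list[dict], path: Path) -> None:
    # One JSON object per line, e.g. {"audio_path": "...", "text": "..."}
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")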