Joseph Pollack committed on
Commit
fb12450
·
unverified ·
1 Parent(s): e83891f

Adds Granary dataset for European languages

Browse files
Files changed (1) hide show
  1. interface.py +181 -206
interface.py CHANGED
@@ -252,159 +252,115 @@ def start_voxtral_training(
252
 
253
 
254
  def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
255
- """Load phrases from various multilingual speech datasets.
256
 
257
- Uses datasets that work with current library versions:
258
- 1. ML Commons Speech (modern format)
259
- 2. Multilingual LibriSpeech (modern format)
260
- 3. Fallback to basic phrases
261
 
262
  Args:
263
  language: Language code (e.g., 'en', 'de', 'fr', etc.)
264
- max_phrases: Maximum number of phrases to load (None for all available)
265
  split: Dataset split to use ('train', 'validation', 'test')
266
 
267
  Returns:
268
- List of normalized text phrases
269
  """
270
  from datasets import load_dataset
271
  import random
272
 
273
- # Language code mapping for different datasets
274
- lang_mappings = {
275
- "en": {"ml_speech": "en", "librispeech": "clean"},
276
- "de": {"ml_speech": "de", "librispeech": None},
277
- "fr": {"ml_speech": "fr", "librispeech": None},
278
- "es": {"ml_speech": "es", "librispeech": None},
279
- "it": {"ml_speech": "it", "librispeech": None},
280
- "pt": {"ml_speech": "pt", "librispeech": None},
281
- "pl": {"ml_speech": "pl", "librispeech": None},
282
- "nl": {"ml_speech": "nl", "librispeech": None},
283
- "ru": {"ml_speech": "ru", "librispeech": None},
284
- "ar": {"ml_speech": "ar", "librispeech": None},
285
- "zh": {"ml_speech": "zh", "librispeech": None},
286
- "ja": {"ml_speech": "ja", "librispeech": None},
287
- "ko": {"ml_speech": "ko", "librispeech": None},
288
  }
289
 
290
- lang_config = lang_mappings.get(language, {"ml_speech": language, "librispeech": None})
 
291
 
292
- # Try ML Commons Speech first (modern format)
293
  try:
294
- print(f"Trying ML Commons Speech dataset for language: {language}")
295
- ml_lang = lang_config["ml_speech"]
296
- ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
 
 
297
 
298
  phrases = []
299
  count = 0
300
- seen_words = set()
301
 
 
302
  for example in ds:
303
- if max_phrases and count >= max_phrases:
304
  break
305
- word = example.get("word", "").strip()
306
- if word and len(word) > 2 and word not in seen_words: # Filter duplicates and short words
307
- phrases.append(word)
308
- seen_words.add(word)
309
- count += 1
310
 
311
- if phrases:
312
- print(f"Successfully loaded {len(phrases)} phrases from ML Commons Speech")
313
- random.shuffle(phrases)
314
- return phrases
315
 
316
- except Exception as e:
317
- print(f"ML Commons Speech failed: {e}")
 
 
 
 
 
318
 
319
- # Try Multilingual LibriSpeech as backup
320
- try:
321
- if lang_config["librispeech"]:
322
- print(f"Trying Multilingual LibriSpeech dataset for language: {language}")
323
- librispeech_lang = lang_config["librispeech"]
324
- ds = load_dataset("facebook/multilingual_librispeech", f"{language}", split=split, streaming=True)
325
-
326
- phrases = []
327
- count = 0
328
- for example in ds:
329
- if max_phrases and count >= max_phrases:
330
- break
331
- text = example.get("text", "").strip()
332
- if text and len(text) > 10: # Filter out very short phrases
333
- phrases.append(text)
334
- count += 1
335
-
336
- if phrases:
337
- print(f"Successfully loaded {len(phrases)} phrases from Multilingual LibriSpeech")
338
- random.shuffle(phrases)
339
- return phrases
340
-
341
- except Exception as e:
342
- print(f"Multilingual LibriSpeech failed: {e}")
343
-
344
- # Try TED Talk translations (works for many languages)
345
- try:
346
- print(f"Trying TED Talk translations for language: {language}")
347
- ds = load_dataset("ted_talks_iwslt", language=[f"{language}_en"], split=split, streaming=True)
348
-
349
- phrases = []
350
- count = 0
351
- for example in ds:
352
- if max_phrases and count >= max_phrases:
353
- break
354
- text = example.get("translation", {}).get(language, "").strip()
355
- if text and len(text) > 10: # Filter out very short phrases
356
  phrases.append(text)
 
357
  count += 1
358
 
359
  if phrases:
360
- print(f"Successfully loaded {len(phrases)} phrases from TED Talks")
361
  random.shuffle(phrases)
 
362
  return phrases
363
 
 
 
 
 
364
  except Exception as e:
365
- print(f"TED Talks failed: {e}")
366
-
367
- # Final fallback to basic phrases
368
- print("All dataset loading attempts failed, using fallback phrases")
369
- fallback_phrases = [
370
- "The quick brown fox jumps over the lazy dog.",
371
- "Please say your full name.",
372
- "Today is a good day to learn something new.",
373
- "Artificial intelligence helps with many tasks.",
374
- "I enjoy reading books and listening to music.",
375
- "This is a sample sentence for testing speech.",
376
- "Speak clearly and at a normal pace.",
377
- "Numbers like one, two, three are easy to say.",
378
- "The weather is sunny with a chance of rain.",
379
- "Thank you for taking the time to help.",
380
- "Hello, how are you today?",
381
- "I would like to order a pizza.",
382
- "The meeting is scheduled for tomorrow.",
383
- "Please call me back as soon as possible.",
384
- "Thank you for your assistance.",
385
- "Can you help me with this problem?",
386
- "I need to make a reservation.",
387
- "The weather looks beautiful outside.",
388
- "Let's go for a walk in the park.",
389
- "I enjoy listening to classical music.",
390
- "What time does the store open?",
391
- "I forgot my password again.",
392
- "Please send me the invoice.",
393
- "The project is almost complete.",
394
- "I appreciate your hard work.",
395
- "Let's schedule a meeting next week.",
396
- "The food tastes delicious.",
397
- "I need to buy some groceries.",
398
- "Please turn off the lights.",
399
- "The presentation went very well.",
400
- ]
401
 
402
- if max_phrases:
403
- fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
404
- else:
405
- random.shuffle(fallback_phrases)
406
 
407
- return fallback_phrases
408
 
409
  # Initialize phrases dynamically
410
  DEFAULT_LANGUAGE = "en" # Default to English
@@ -447,15 +403,43 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
447
 
448
  jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
449
 
450
- # Language selection for multilingual phrases
451
  language_selector = gr.Dropdown(
452
  choices=[
453
- "en", "de", "fr", "es", "it", "pt", "pl", "nl", "ru",
454
- "ar", "zh", "ja", "ko", "tr", "ca", "sv", "fi", "da"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  ],
456
  value="en",
457
  label="Language for Speech Phrases",
458
- info="Select language for phrases from Common Voice, FLEURS, or fallback datasets"
459
  )
460
 
461
  # Recording grid with dynamic text readouts
@@ -491,15 +475,20 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
491
  """Add 10 more rows by making them visible"""
492
  new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))
493
 
494
- # Create updates for all MAX_COMPONENTS
495
- visibility_updates = []
 
 
496
  for i in range(MAX_COMPONENTS):
497
  if i < len(current_phrases) and i < new_visible:
498
- visibility_updates.append(gr.update(visible=True))
 
499
  else:
500
- visibility_updates.append(gr.update(visible=False))
 
501
 
502
- return [new_visible] + visibility_updates
 
503
 
504
  def change_language(language):
505
  """Change the language and reload phrases from multilingual datasets"""
@@ -507,17 +496,23 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
507
  # Reset visible rows to 10
508
  visible_count = min(10, len(new_phrases), MAX_COMPONENTS)
509
 
510
- # Create updates for all MAX_COMPONENTS
511
- combined_updates = []
 
 
512
  for i in range(MAX_COMPONENTS):
513
  if i < len(new_phrases) and i < visible_count:
514
- combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
 
515
  elif i < len(new_phrases):
516
- combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=False))
 
517
  else:
518
- combined_updates.append(gr.update(value=f"**{i+1}. **", visible=False))
 
519
 
520
- return [new_phrases, visible_count] + combined_updates
 
521
 
522
  # Connect language change to phrase reloading
523
  language_selector.change(
@@ -647,112 +642,87 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
647
 
648
  # Quick sample from multilingual datasets (Common Voice, etc.)
649
  with gr.Row():
650
- vp_lang = gr.Dropdown(choices=["en", "de", "fr", "es", "it", "pl", "pt", "nl", "ru", "ar", "zh", "ja", "ko"], value="en", label="Sample Language")
651
  vp_samples = gr.Number(value=20, precision=0, label="Num samples")
652
  vp_split = gr.Dropdown(choices=["train", "validation", "test"], value="train", label="Split")
653
  vp_btn = gr.Button("Use Multilingual Dataset Sample")
654
 
655
  def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
656
- """Load sample from multilingual datasets (ML Commons preferred)"""
657
  from datasets import load_dataset, Audio
658
  import random
659
 
660
- # Language code mapping for ML Commons Speech
661
- ml_lang_map = {
662
  "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
663
  "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
664
- "zh": "zh", "ja": "ja", "ko": "ko"
 
 
 
665
  }
666
 
667
- ml_lang = ml_lang_map.get(lang_code, lang_code)
668
 
669
  try:
670
- # Try ML Commons Speech first
671
- ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
672
- ds = ds.cast_column("audio", Audio(sampling_rate=16000))
673
 
674
- dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
675
- rows: list[dict] = []
676
- texts: list[str] = []
677
 
 
 
 
678
  count = 0
679
- seen_words = set()
680
 
681
- for ex in ds:
 
682
  if count >= num_samples:
683
  break
684
 
685
- audio = ex.get("audio") or {}
686
- path = audio.get("path")
687
- word = ex.get("word", "").strip()
688
-
689
- if path and word and len(word) > 2 and word not in seen_words:
690
- rows.append({"audio_path": path, "text": word})
691
- texts.append(str(word))
692
- seen_words.add(word)
693
- count += 1
694
 
695
- if rows:
696
- jsonl_path = dataset_dir / "data.jsonl"
697
- _write_jsonl(rows, jsonl_path)
698
-
699
- # Build markdown content updates for on-screen prompts
700
- combined_updates = []
701
- for i in range(MAX_COMPONENTS):
702
- t = texts[i] if i < len(texts) else ""
703
- if i < len(texts):
704
- combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
705
- else:
706
- combined_updates.append(gr.update(visible=False))
707
-
708
- return (str(jsonl_path), texts, *combined_updates)
709
-
710
- except Exception as e:
711
- print(f"ML Commons Speech sample loading failed: {e}")
712
 
713
- # Try Multilingual LibriSpeech as backup
714
- try:
715
- ds = load_dataset("facebook/multilingual_librispeech", f"{lang_code}", split=split, streaming=True)
716
- ds = ds.cast_column("audio", Audio(sampling_rate=16000))
717
-
718
- dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
719
- rows: list[dict] = []
720
- texts: list[str] = []
721
-
722
- count = 0
723
- for ex in ds:
724
- if count >= num_samples:
725
- break
726
-
727
- audio = ex.get("audio") or {}
728
- path = audio.get("path")
729
- text = ex.get("text", "").strip()
730
-
731
- if path and text and len(text) > 10:
732
- rows.append({"audio_path": path, "text": text})
733
- texts.append(str(text))
734
  count += 1
735
 
736
  if rows:
737
  jsonl_path = dataset_dir / "data.jsonl"
738
  _write_jsonl(rows, jsonl_path)
739
 
740
- # Build markdown content updates for on-screen prompts
741
- combined_updates = []
 
 
 
742
  for i in range(MAX_COMPONENTS):
743
  t = texts[i] if i < len(texts) else ""
744
  if i < len(texts):
745
- combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
 
746
  else:
747
- combined_updates.append(gr.update(visible=False))
 
 
 
748
 
749
  return (str(jsonl_path), texts, *combined_updates)
750
 
751
  except Exception as e:
752
- print(f"Multilingual LibriSpeech failed: {e}")
753
 
754
- # Fallback: generate synthetic samples with text only
755
- print("Using fallback: generating text-only samples")
756
  phrases = load_multilingual_phrases(lang_code, max_phrases=num_samples)
757
  texts = phrases[:num_samples]
758
 
@@ -761,14 +731,19 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
761
  jsonl_path = dataset_dir / "data.jsonl"
762
  _write_jsonl(rows, jsonl_path)
763
 
764
- # Build markdown content updates for on-screen prompts
765
- combined_updates = []
 
766
  for i in range(MAX_COMPONENTS):
767
  t = texts[i] if i < len(texts) else ""
768
  if i < len(texts):
769
- combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
 
770
  else:
771
- combined_updates.append(gr.update(visible=False))
 
 
 
772
 
773
  return (str(jsonl_path), texts, *combined_updates)
774
 
 
252
 
253
 
254
  def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
255
+ """Load phrases from NVIDIA Granary dataset.
256
 
257
+ Uses the high-quality Granary dataset which contains speech recognition
258
+ and translation data for 25 European languages.
 
 
259
 
260
  Args:
261
  language: Language code (e.g., 'en', 'de', 'fr', etc.)
262
+ max_phrases: Maximum number of phrases to load (None for default 1000)
263
  split: Dataset split to use ('train', 'validation', 'test')
264
 
265
  Returns:
266
+ List of transcription phrases from Granary dataset
267
  """
268
  from datasets import load_dataset
269
  import random
270
 
271
+ # Default to 1000 phrases if not specified
272
+ if max_phrases is None:
273
+ max_phrases = 1000
274
+
275
+ # Language code mapping for Granary dataset
276
+ # Granary supports these language codes directly
277
+ granary_supported_langs = {
278
+ "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
279
+ "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
280
+ "zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
281
+ "no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
282
+ "sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
283
+ "uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
 
 
284
  }
285
 
286
+ # Map input language to Granary configuration
287
+ granary_lang = granary_supported_langs.get(language, "en") # Default to English
288
 
 
289
  try:
290
+ print(f"Loading phrases from NVIDIA Granary dataset for language: {language}")
291
+
292
+ # Load Granary dataset with ASR (speech recognition) split
293
+ # Use streaming to handle large datasets efficiently
294
+ ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)
295
 
296
  phrases = []
297
  count = 0
298
+ seen_phrases = set()
299
 
300
+ # Sample phrases from the dataset
301
  for example in ds:
302
+ if count >= max_phrases:
303
  break
 
 
 
 
 
304
 
305
+ # Extract the text transcription
306
+ text = example.get("text", "").strip()
 
 
307
 
308
+ # Filter for quality phrases
309
+ if (text and
310
+ len(text) > 10 and # Minimum length
311
+ len(text) < 200 and # Maximum length to avoid very long utterances
312
+ text not in seen_phrases and # Avoid duplicates
313
+ not text.isdigit() and # Avoid pure numbers
314
+ not all(c in "0123456789., " for c in text)): # Avoid mostly numeric
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  phrases.append(text)
317
+ seen_phrases.add(text)
318
  count += 1
319
 
320
  if phrases:
321
+ # Shuffle the phrases for variety
322
  random.shuffle(phrases)
323
+ print(f"Successfully loaded {len(phrases)} phrases from Granary dataset for {language}")
324
  return phrases
325
 
326
+ else:
327
+ print(f"No suitable phrases found in Granary dataset for {language}")
328
+ raise Exception("No phrases found")
329
+
330
  except Exception as e:
331
+ print(f"Granary dataset loading failed for {language}: {e}")
332
+
333
+ # Fallback to basic phrases if Granary fails
334
+ print("Using fallback phrases")
335
+ fallback_phrases = [
336
+ "The quick brown fox jumps over the lazy dog.",
337
+ "Please say your full name.",
338
+ "Today is a good day to learn something new.",
339
+ "Artificial intelligence helps with many tasks.",
340
+ "I enjoy reading books and listening to music.",
341
+ "This is a sample sentence for testing speech.",
342
+ "Speak clearly and at a normal pace.",
343
+ "Numbers like one, two, three are easy to say.",
344
+ "The weather is sunny with a chance of rain.",
345
+ "Thank you for taking the time to help.",
346
+ "Hello, how are you today?",
347
+ "I would like to order a pizza.",
348
+ "The meeting is scheduled for tomorrow.",
349
+ "Please call me back as soon as possible.",
350
+ "Thank you for your assistance.",
351
+ "Can you help me with this problem?",
352
+ "I need to make a reservation.",
353
+ "The weather looks beautiful outside.",
354
+ "Let's go for a walk in the park.",
355
+ "I enjoy listening to classical music.",
356
+ ]
 
 
 
 
 
 
 
 
 
 
357
 
358
+ if max_phrases:
359
+ fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
360
+ else:
361
+ random.shuffle(fallback_phrases)
362
 
363
+ return fallback_phrases
364
 
365
  # Initialize phrases dynamically
366
  DEFAULT_LANGUAGE = "en" # Default to English
 
403
 
404
  jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
405
 
406
+ # Language selection for NVIDIA Granary phrases
407
  language_selector = gr.Dropdown(
408
  choices=[
409
+ ("English", "en"),
410
+ ("German", "de"),
411
+ ("French", "fr"),
412
+ ("Spanish", "es"),
413
+ ("Italian", "it"),
414
+ ("Portuguese", "pt"),
415
+ ("Polish", "pl"),
416
+ ("Dutch", "nl"),
417
+ ("Russian", "ru"),
418
+ ("Arabic", "ar"),
419
+ ("Chinese", "zh"),
420
+ ("Japanese", "ja"),
421
+ ("Korean", "ko"),
422
+ ("Danish", "da"),
423
+ ("Swedish", "sv"),
424
+ ("Norwegian", "no"),
425
+ ("Finnish", "fi"),
426
+ ("Estonian", "et"),
427
+ ("Latvian", "lv"),
428
+ ("Lithuanian", "lt"),
429
+ ("Slovenian", "sl"),
430
+ ("Slovak", "sk"),
431
+ ("Czech", "cs"),
432
+ ("Croatian", "hr"),
433
+ ("Bulgarian", "bg"),
434
+ ("Ukrainian", "uk"),
435
+ ("Romanian", "ro"),
436
+ ("Hungarian", "hu"),
437
+ ("Greek", "el"),
438
+ ("Maltese", "mt")
439
  ],
440
  value="en",
441
  label="Language for Speech Phrases",
442
+ info="Select language for authentic phrases from NVIDIA Granary dataset (25 European languages)"
443
  )
444
 
445
  # Recording grid with dynamic text readouts
 
475
  """Add 10 more rows by making them visible"""
476
  new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))
477
 
478
+ # Create updates for all MAX_COMPONENTS (both markdown and audio components)
479
+ markdown_updates = []
480
+ audio_updates = []
481
+
482
  for i in range(MAX_COMPONENTS):
483
  if i < len(current_phrases) and i < new_visible:
484
+ markdown_updates.append(gr.update(visible=True))
485
+ audio_updates.append(gr.update(visible=True))
486
  else:
487
+ markdown_updates.append(gr.update(visible=False))
488
+ audio_updates.append(gr.update(visible=False))
489
 
490
+ # Return: [state] + markdown_updates + audio_updates
491
+ return [new_visible] + markdown_updates + audio_updates
492
 
493
  def change_language(language):
494
  """Change the language and reload phrases from multilingual datasets"""
 
496
  # Reset visible rows to 10
497
  visible_count = min(10, len(new_phrases), MAX_COMPONENTS)
498
 
499
+ # Create separate updates for markdown and audio components
500
+ markdown_updates = []
501
+ audio_updates = []
502
+
503
  for i in range(MAX_COMPONENTS):
504
  if i < len(new_phrases) and i < visible_count:
505
+ markdown_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
506
+ audio_updates.append(gr.update(visible=True))
507
  elif i < len(new_phrases):
508
+ markdown_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=False))
509
+ audio_updates.append(gr.update(visible=False))
510
  else:
511
+ markdown_updates.append(gr.update(value=f"**{i+1}. **", visible=False))
512
+ audio_updates.append(gr.update(visible=False))
513
 
514
+ # Return: [phrases_state, visible_state] + markdown_updates + audio_updates
515
+ return [new_phrases, visible_count] + markdown_updates + audio_updates
516
 
517
  # Connect language change to phrase reloading
518
  language_selector.change(
 
642
 
643
  # Quick sample from multilingual datasets (Common Voice, etc.)
644
  with gr.Row():
645
+ vp_lang = gr.Dropdown(choices=["en", "de", "fr", "es", "it", "pl", "pt", "nl", "ru", "ar", "zh", "ja", "ko", "da", "sv", "fi", "et", "cs", "hr", "bg", "uk", "ro", "hu", "el"], value="en", label="Sample Language")
646
  vp_samples = gr.Number(value=20, precision=0, label="Num samples")
647
  vp_split = gr.Dropdown(choices=["train", "validation", "test"], value="train", label="Split")
648
  vp_btn = gr.Button("Use Multilingual Dataset Sample")
649
 
650
  def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
651
+ """Collect sample audio and text from NVIDIA Granary dataset"""
652
  from datasets import load_dataset, Audio
653
  import random
654
 
655
+ # Map language code to Granary format
656
+ granary_lang_map = {
657
  "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
658
  "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
659
+ "zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
660
+ "no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
661
+ "sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
662
+ "uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
663
  }
664
 
665
+ granary_lang = granary_lang_map.get(lang_code, "en")
666
 
667
  try:
668
+ print(f"Collecting {num_samples} samples from NVIDIA Granary dataset for language: {lang_code}")
 
 
669
 
670
+ # Load Granary dataset with ASR split
671
+ ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)
 
672
 
673
+ dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
674
+ rows = []
675
+ texts = []
676
  count = 0
 
677
 
678
+ # Sample from the dataset
679
+ for example in ds:
680
  if count >= num_samples:
681
  break
682
 
683
+ text = example.get("text", "").strip()
684
+ audio_path = example.get("audio_filepath", "")
 
 
 
 
 
 
 
685
 
686
+ # Filter for quality samples
687
+ if (text and
688
+ len(text) > 10 and
689
+ len(text) < 200 and
690
+ audio_path): # Must have audio file
 
 
 
 
 
 
 
 
 
 
 
 
691
 
692
+ rows.append({
693
+ "audio_path": audio_path,
694
+ "text": text
695
+ })
696
+ texts.append(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  count += 1
698
 
699
  if rows:
700
  jsonl_path = dataset_dir / "data.jsonl"
701
  _write_jsonl(rows, jsonl_path)
702
 
703
+ print(f"Successfully collected {len(rows)} samples from Granary dataset")
704
+
705
+ # Build markdown and audio content updates for on-screen prompts
706
+ markdown_updates = []
707
+ audio_updates = []
708
  for i in range(MAX_COMPONENTS):
709
  t = texts[i] if i < len(texts) else ""
710
  if i < len(texts):
711
+ markdown_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
712
+ audio_updates.append(gr.update(visible=True))
713
  else:
714
+ markdown_updates.append(gr.update(visible=False))
715
+ audio_updates.append(gr.update(visible=False))
716
+
717
+ combined_updates = markdown_updates + audio_updates
718
 
719
  return (str(jsonl_path), texts, *combined_updates)
720
 
721
  except Exception as e:
722
+ print(f"Granary sample collection failed for {lang_code}: {e}")
723
 
724
+ # Fallback: generate text-only samples if Granary fails
725
+ print(f"Using fallback: generating text-only samples for {lang_code}")
726
  phrases = load_multilingual_phrases(lang_code, max_phrases=num_samples)
727
  texts = phrases[:num_samples]
728
 
 
731
  jsonl_path = dataset_dir / "data.jsonl"
732
  _write_jsonl(rows, jsonl_path)
733
 
734
+ # Build markdown and audio content updates for on-screen prompts
735
+ markdown_updates = []
736
+ audio_updates = []
737
  for i in range(MAX_COMPONENTS):
738
  t = texts[i] if i < len(texts) else ""
739
  if i < len(texts):
740
+ markdown_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
741
+ audio_updates.append(gr.update(visible=True))
742
  else:
743
+ markdown_updates.append(gr.update(visible=False))
744
+ audio_updates.append(gr.update(visible=False))
745
+
746
+ combined_updates = markdown_updates + audio_updates
747
 
748
  return (str(jsonl_path), texts, *combined_updates)
749