Joseph Pollack committed
Commit b55e1b0 · unverified · 1 Parent(s): a595d5a

adds requirements, improves interface and dataset loading

Files changed (3)
  1. .gitignore +1 -0
  2. interface.py +344 -142
  3. requirements.txt +7 -1
.gitignore ADDED
@@ -0,0 +1 @@
+datasets/
interface.py CHANGED
@@ -50,24 +50,45 @@ def get_username_from_token(token: str) -> Optional[str]:
 def run_command_stream(args: list[str], env: Dict[str, str], cwd: Optional[Path] = None) -> Generator[str, None, int]:
     import subprocess
     import shlex
-    yield f"$ {' '.join(shlex.quote(a) for a in ([get_python()] + args))}"
-    process = subprocess.Popen(
-        [get_python()] + args,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        env=env,
-        cwd=str(cwd or PROJECT_ROOT),
-        bufsize=1,
-        universal_newlines=True,
-    )
-    assert process.stdout is not None
-    for line in iter(process.stdout.readline, ""):
-        yield line.rstrip()
-    process.stdout.close()
-    code = process.wait()
-    yield f"[exit_code={code}]"
-    return code
+    try:
+        cmd_line = ' '.join(shlex.quote(a) for a in ([get_python()] + args))
+        yield f"$ {cmd_line}"
+
+        process = subprocess.Popen(
+            [get_python()] + args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            env=env,
+            cwd=str(cwd or PROJECT_ROOT),
+            bufsize=1,
+            universal_newlines=True,
+        )
+
+        if process.stdout is None:
+            yield "❌ Error: Could not capture process output"
+            return 1
+
+        for line in iter(process.stdout.readline, ""):
+            if line.strip():  # Only yield non-empty lines
+                yield line.rstrip()
+
+        process.stdout.close()
+        code = process.wait()
+
+        if code != 0:
+            yield f"❌ Command failed with exit code: {code}"
+        else:
+            yield f"✅ Command completed successfully (exit code: {code})"
+
+        return code
+
+    except FileNotFoundError as e:
+        yield f"❌ Error: Python executable not found: {e}"
+        return 1
+    except Exception as e:
+        yield f"❌ Error running command: {str(e)}"
+        return 1
 
 
 def detect_nvidia_driver() -> Tuple[bool, str]:
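
The rewrite keeps `run_command_stream` a generator, so callers can still stream log lines as they arrive; the try/except now converts launch failures into yielded error lines instead of raised exceptions. A minimal sketch of a caller, assuming `run_command_stream` as defined above (the `run_and_capture` wrapper is illustrative, not part of the commit):

```python
# Hypothetical caller: drain the generator and join the streamed log lines.
import os

def run_and_capture(args: list[str]) -> str:
    lines: list[str] = []
    for line in run_command_stream(args, env=os.environ.copy()):
        lines.append(line)  # one yielded entry per log line
    return "\n".join(lines)

# e.g. print(run_and_capture(["scripts/train.py", "--epochs", "1"]))
```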
@@ -290,64 +311,93 @@ def start_voxtral_training(
     freeze_audio_tower: bool,
     push_to_hub: bool,
     deploy_demo: bool,
-) -> Generator[str, None, None]:
+) -> str:
+    """Start Voxtral training and return collected logs as a string."""
     env = os.environ.copy()
     write_token = env.get("HF_WRITE_TOKEN") or env.get("HF_TOKEN")
     read_token = env.get("HF_READ_TOKEN")
     username = get_username_from_token(write_token or "") or env.get("HF_USERNAME") or ""
     output_dir = PROJECT_ROOT / "outputs" / repo_short
 
-    # 1) Train
-    script = PROJECT_ROOT / ("scripts/train_lora.py" if use_lora else "scripts/train.py")
-    args = [str(script)]
-    if jsonl_path:
-        args += ["--dataset-jsonl", jsonl_path]
-    args += [
-        "--model-checkpoint", base_model,
-        "--train-count", str(train_count),
-        "--eval-count", str(eval_count),
-        "--batch-size", str(batch_size),
-        "--grad-accum", str(grad_accum),
-        "--learning-rate", str(learning_rate),
-        "--epochs", str(epochs),
-        "--output-dir", str(output_dir),
-        "--save-steps", "50",
-    ]
-    if use_lora:
-        args += [
-            "--lora-r", str(lora_r),
-            "--lora-alpha", str(lora_alpha),
-            "--lora-dropout", str(lora_dropout),
-        ]
-    if freeze_audio_tower:
-        args += ["--freeze-audio-tower"]
-    for line in run_command_stream(args, env):
-        yield line
-
-    # 2) Push to Hub
-    if push_to_hub:
-        repo_name = f"{username}/{repo_short}" if username else repo_short
-        push_args = [
-            str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
-            "model",
-            str(output_dir),
-            repo_name,
-        ]
-        for line in run_command_stream(push_args, env):
-            yield line
-
-    # 3) Deploy demo Space
-    if deploy_demo and username:
-        deploy_args = [
-            str(PROJECT_ROOT / "scripts/deploy_demo_space.py"),
-            "--hf-token", write_token or "",
-            "--hf-username", username,
-            "--model-id", f"{username}/{repo_short}",
-            "--demo-type", "voxtral",
-            "--space-name", f"{repo_short}-demo",
-        ]
-        for line in run_command_stream(deploy_args, env):
-            yield line
+    # Collect all logs
+    all_logs = []
+
+    def collect_logs(generator):
+        """Helper to collect logs from a generator."""
+        for line in generator:
+            all_logs.append(line)
+            print(line)  # Also print to console for debugging
+
+    try:
+        # 1) Train
+        script = PROJECT_ROOT / ("scripts/train_lora.py" if use_lora else "scripts/train.py")
+        args = [str(script)]
+        if jsonl_path:
+            args += ["--dataset-jsonl", jsonl_path]
+        args += [
+            "--model-checkpoint", base_model,
+            "--train-count", str(train_count),
+            "--eval-count", str(eval_count),
+            "--batch-size", str(batch_size),
+            "--grad-accum", str(grad_accum),
+            "--learning-rate", str(learning_rate),
+            "--epochs", str(epochs),
+            "--output-dir", str(output_dir),
+            "--save-steps", "50",
+        ]
+        if use_lora:
+            args += [
+                "--lora-r", str(lora_r),
+                "--lora-alpha", str(lora_alpha),
+                "--lora-dropout", str(lora_dropout),
+            ]
+        if freeze_audio_tower:
+            args += ["--freeze-audio-tower"]
+
+        all_logs.append("🚀 Starting Voxtral training...")
+        collect_logs(run_command_stream(args, env))
+        all_logs.append("✅ Training completed!")
+
+        # 2) Push to Hub
+        if push_to_hub:
+            if not username:
+                all_logs.append("❌ Cannot push to Hub: No username available. Set HF_TOKEN or HF_USERNAME.")
+            else:
+                repo_name = f"{username}/{repo_short}"
+                push_args = [
+                    str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
+                    "model",
+                    str(output_dir),
+                    repo_name,
+                ]
+                all_logs.append(f"📤 Pushing model to Hugging Face Hub: {repo_name}")
+                collect_logs(run_command_stream(push_args, env))
+                all_logs.append("✅ Model pushed successfully!")
+
+        # 3) Deploy demo Space
+        if deploy_demo and username:
+            deploy_args = [
+                str(PROJECT_ROOT / "scripts/deploy_demo_space.py"),
+                "--hf-token", write_token or "",
+                "--hf-username", username,
+                "--model-id", f"{username}/{repo_short}",
+                "--demo-type", "voxtral",
+                "--space-name", f"{repo_short}-demo",
+            ]
+            all_logs.append("🚀 Deploying demo Space...")
+            collect_logs(run_command_stream(deploy_args, env))
+            all_logs.append("✅ Demo Space deployed!")
+
+        # Return all collected logs as a single string
+        return "\n".join(all_logs)
+
+    except Exception as e:
+        error_msg = f"❌ Error during training: {str(e)}"
+        all_logs.append(error_msg)
+        print(error_msg)  # Also print to console
+        import traceback
+        traceback.print_exc()
+        return "\n".join(all_logs)
 
 
 def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
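
The signature change from `Generator[str, None, None]` to `str` is the crux: instead of yielding log lines for incremental display, the function now drains each child process through `collect_logs` and returns one joined string, which binds directly to a single-output Gradio handler. A self-contained sketch of that collect-then-return pattern (`fake_stream` stands in for `run_command_stream` and is not from the commit):

```python
# Collect-then-return: drain a generator of log lines into one string.
from typing import Generator, List

def fake_stream() -> Generator[str, None, None]:
    yield "$ python scripts/train.py --epochs 1"
    yield "[exit_code=0]"

def collect(gen: Generator[str, None, None]) -> str:
    all_logs: List[str] = []
    for line in gen:
        all_logs.append(line)
        print(line)  # mirror to the console, as the new code does
    return "\n".join(all_logs)

print(collect(fake_stream()))  # both lines, joined by "\n"
```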
@@ -371,35 +421,45 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
     if max_phrases is None:
         max_phrases = 1000
 
-    # Language code mapping for Granary dataset
-    # Granary supports these language codes directly
-    granary_supported_langs = {
-        "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
-        "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
-        "zh": "zh", "ja": "ja", "ko": "ko", "da": "da", "sv": "sv",
-        "no": "no", "fi": "fi", "et": "et", "lv": "lv", "lt": "lt",
-        "sl": "sl", "sk": "sk", "cs": "cs", "hr": "hr", "bg": "bg",
-        "uk": "uk", "ro": "ro", "hu": "hu", "el": "el", "mt": "mt"
-    }
+    # Language code mapping for CohereLabs AYA Collection dataset
+    # All Voxtral Mini supported languages are available in AYA Collection
+    aya_supported_langs = {
+        "en": "english",     # English
+        "fr": "french",      # French
+        "de": "german",      # German
+        "es": "spanish",     # Spanish
+        "it": "italian",     # Italian
+        "pt": "portuguese",  # Portuguese
+        "nl": "dutch",       # Dutch
+        "hi": "hindi"        # Hindi
+    }
 
-    # Map input language to Granary configuration
-    granary_lang = granary_supported_langs.get(language, "en")  # Default to English
+    # Map input language to CohereLabs AYA Collection configuration
+    aya_lang = aya_supported_langs.get(language)
+
+    if not aya_lang:
+        raise Exception(f"Language {language} not supported in CohereLabs AYA Collection dataset")
 
     try:
-        print(f"Loading phrases from NVIDIA Granary dataset for language: {language}")
+        print(f"Loading phrases from CohereLabs AYA Collection dataset for language: {language}")
 
         # Check for authentication token
         token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
 
-        # Load Granary dataset with ASR (speech recognition) split
-        # Use streaming to handle large datasets efficiently
+        # Try to load CohereLabs AYA Collection dataset for the specified language
         if token:
-            print(f"Using authentication token for Granary dataset access")
-            ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True, token=token)
+            try:
+                ds = load_dataset("CohereLabs/aya_collection_language_split", aya_lang, split="train", streaming=True, token=token)
+                print(f"Successfully loaded CohereLabs AYA Collection {language} dataset")
+            except Exception as e:
+                # Fallback to other datasets
+                print(f"CohereLabs AYA Collection {language} not available ({e}), trying alternative datasets")
+                raise Exception("AYA Collection not available")
         else:
-            print(f"No HF_TOKEN found, attempting to load Granary dataset without authentication")
-            ds = load_dataset("nvidia/Granary", granary_lang, split="asr", streaming=True)
+            print("No HF_TOKEN found for CohereLabs AYA Collection dataset")
+            raise Exception("No token available")
 
+        # Common processing for both dataset types
         phrases = []
         count = 0
         seen_phrases = set()
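
Because `streaming=True`, the AYA Collection split is iterated lazily rather than downloaded up front, which is what makes a 100+-language corpus practical here. A minimal standalone read, assuming a valid `HF_TOKEN` and the config names from the mapping above:

```python
# Stream a few examples from the AYA Collection language split (requires HF_TOKEN).
import os
from itertools import islice
from datasets import load_dataset

ds = load_dataset(
    "CohereLabs/aya_collection_language_split",
    "english",       # configs are full language names, per the mapping above
    split="train",
    streaming=True,  # lazy iteration, no full download
    token=os.getenv("HF_TOKEN"),
)
for example in islice(ds, 3):
    print(example.get("inputs", ""), "->", example.get("targets", ""))
```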
@@ -409,8 +469,10 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
             if count >= max_phrases:
                 break
 
-            # Extract the text transcription
-            text = example.get("text", "").strip()
+            # Extract text from CohereLabs AYA Collection format: combine inputs and targets
+            inputs_text = example.get("inputs", "").strip()
+            targets_text = example.get("targets", "").strip()
+            text = f"{inputs_text} {targets_text}".strip()
 
             # Filter for quality phrases
             if (text and
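
In isolation, the `inputs`/`targets` join degrades cleanly when either field is missing or empty, since the outer `strip()` removes the stray separator space:

```python
# The inputs/targets join used above, in isolation.
example = {"inputs": "Translate to French: Hello", "targets": "Bonjour"}
inputs_text = example.get("inputs", "").strip()
targets_text = example.get("targets", "").strip()
text = f"{inputs_text} {targets_text}".strip()
print(text)  # "Translate to French: Hello Bonjour"
```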
@@ -427,45 +489,206 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
         if phrases:
             # Shuffle the phrases for variety
             random.shuffle(phrases)
-            print(f"Successfully loaded {len(phrases)} phrases from Granary dataset for {language}")
+            dataset_name = "CohereLabs AYA Collection"
+            print(f"Successfully loaded {len(phrases)} phrases from {dataset_name} dataset for {language}")
             return phrases
 
         else:
-            print(f"No suitable phrases found in Granary dataset for {language}")
+            print(f"No suitable phrases found in dataset for {language}")
             raise Exception("No phrases found")
 
     except Exception as e:
         error_msg = str(e).lower()
         if "401" in error_msg or "unauthorized" in error_msg:
-            print(f"Granary dataset authentication failed for {language}: {e}")
+            print(f"CohereLabs AYA Collection authentication failed for {language}: {e}")
             print("This dataset requires a Hugging Face token. Please set HF_TOKEN environment variable.")
         else:
-            print(f"Granary dataset loading failed for {language}: {e}")
+            print(f"CohereLabs AYA Collection loading failed for {language}: {e}")
 
-    # Fallback to basic phrases if Granary fails
+    # Fallback to basic phrases if dataset loading fails
     print("Using fallback phrases")
-    fallback_phrases = [
-        "The quick brown fox jumps over the lazy dog.",
-        "Please say your full name.",
-        "Today is a good day to learn something new.",
-        "Artificial intelligence helps with many tasks.",
-        "I enjoy reading books and listening to music.",
-        "This is a sample sentence for testing speech.",
-        "Speak clearly and at a normal pace.",
-        "Numbers like one, two, three are easy to say.",
-        "The weather is sunny with a chance of rain.",
-        "Thank you for taking the time to help.",
-        "Hello, how are you today?",
-        "I would like to order a pizza.",
-        "The meeting is scheduled for tomorrow.",
-        "Please call me back as soon as possible.",
-        "Thank you for your assistance.",
-        "Can you help me with this problem?",
-        "I need to make a reservation.",
-        "The weather looks beautiful outside.",
-        "Let's go for a walk in the park.",
-        "I enjoy listening to classical music.",
-    ]
+
+    # Language-specific fallback phrases
+    language_fallbacks = {
+        "hi": [
+            "नमस्ते, आज आप कैसे हैं?",
+            "मेरा नाम राजेश कुमार है।",
+            "आज का मौसम बहुत अच्छा है।",
+            "मैं हिंदी में बात करना चाहता हूं।",
+            "कृपया धीरे और स्पष्ट बोलें।",
+            "यह एक परीक्षण वाक्य है।",
+            "मैं पुस्तकें पढ़ना पसंद करता हूं।",
+            "क्या आप मेरी मदद कर सकते हैं?",
+            "आपका फोन नंबर क्या है?",
+            "मैं कल सुबह आऊंगा।",
+            "धन्यवाद, आपका समय देने के लिए।",
+            "यह जगह बहुत सुंदर है।",
+            "मैं भोजन तैयार करना सीख रहा हूं।",
+            "क्या यह रास्ता सही है?",
+            "मैं स्कूल जाना चाहता हूं।",
+            "आपकी उम्र क्या है?",
+            "यह कितने का है?",
+            "मैं थक गया हूं।",
+            "आप कहां से हैं?",
+            "चलिए पार्क में टहलते हैं।"
+        ],
+        "en": [
+            "Hello, how are you today?",
+            "My name is John Smith.",
+            "The weather is very nice today.",
+            "I want to speak in English.",
+            "Please speak slowly and clearly.",
+            "This is a test sentence.",
+            "I enjoy reading books.",
+            "Can you help me?",
+            "What is your phone number?",
+            "I will come tomorrow morning.",
+            "Thank you for your time.",
+            "This place is very beautiful.",
+            "I am learning to cook food.",
+            "Is this the right way?",
+            "I want to go to school.",
+            "How old are you?",
+            "How much does this cost?",
+            "I am tired.",
+            "Where are you from?",
+            "Let's go for a walk in the park."
+        ],
+        "fr": [
+            "Bonjour, comment allez-vous aujourd'hui?",
+            "Je m'appelle Jean Dupont.",
+            "Le temps est très beau aujourd'hui.",
+            "Je veux parler en français.",
+            "Parlez lentement et clairement s'il vous plaît.",
+            "Ceci est une phrase de test.",
+            "J'aime lire des livres.",
+            "Pouvez-vous m'aider?",
+            "Quel est votre numéro de téléphone?",
+            "Je viendrai demain matin.",
+            "Merci pour votre temps.",
+            "Cet endroit est très beau.",
+            "J'apprends à cuisiner.",
+            "Est-ce le bon chemin?",
+            "Je veux aller à l'école.",
+            "Quel âge avez-vous?",
+            "Combien cela coûte-t-il?",
+            "Je suis fatigué.",
+            "D'où venez-vous?",
+            "Allons nous promener dans le parc."
+        ],
+        "de": [
+            "Hallo, wie geht es Ihnen heute?",
+            "Mein Name ist Hans Müller.",
+            "Das Wetter ist heute sehr schön.",
+            "Ich möchte auf Deutsch sprechen.",
+            "Sprechen Sie bitte langsam und deutlich.",
+            "Dies ist ein Testsatz.",
+            "Ich lese gerne Bücher.",
+            "Können Sie mir helfen?",
+            "Wie ist Ihre Telefonnummer?",
+            "Ich komme morgen früh.",
+            "Vielen Dank für Ihre Zeit.",
+            "Dieser Ort ist sehr schön.",
+            "Ich lerne kochen.",
+            "Ist das der richtige Weg?",
+            "Ich möchte zur Schule gehen.",
+            "Wie alt sind Sie?",
+            "Wie viel kostet das?",
+            "Ich bin müde.",
+            "Woher kommen Sie?",
+            "Lassen Sie uns im Park spazieren gehen."
+        ],
+        "es": [
+            "Hola, ¿cómo estás hoy?",
+            "Me llamo Juan García.",
+            "El tiempo está muy bueno hoy.",
+            "Quiero hablar en español.",
+            "Por favor habla despacio y claro.",
+            "Esta es una oración de prueba.",
+            "Me gusta leer libros.",
+            "¿Puedes ayudarme?",
+            "¿Cuál es tu número de teléfono?",
+            "Vendré mañana por la mañana.",
+            "Gracias por tu tiempo.",
+            "Este lugar es muy bonito.",
+            "Estoy aprendiendo a cocinar.",
+            "¿Es este el camino correcto?",
+            "Quiero ir a la escuela.",
+            "¿Cuántos años tienes?",
+            "¿Cuánto cuesta esto?",
+            "Estoy cansado.",
+            "¿De dónde eres?",
+            "Vamos a caminar por el parque."
+        ],
+        "it": [
+            "Ciao, come stai oggi?",
+            "Mi chiamo Mario Rossi.",
+            "Il tempo è molto bello oggi.",
+            "Voglio parlare in italiano.",
+            "Per favore parla lentamente e chiaramente.",
+            "Questa è una frase di prova.",
+            "Mi piace leggere libri.",
+            "Puoi aiutarmi?",
+            "Qual è il tuo numero di telefono?",
+            "Verrò domani mattina.",
+            "Grazie per il tuo tempo.",
+            "Questo posto è molto bello.",
+            "Sto imparando a cucinare.",
+            "È questa la strada giusta?",
+            "Voglio andare a scuola.",
+            "Quanti anni hai?",
+            "Quanto costa questo?",
+            "Sono stanco.",
+            "Da dove vieni?",
+            "Andiamo a fare una passeggiata nel parco."
+        ],
+        "pt": [
+            "Olá, como você está hoje?",
+            "Meu nome é João Silva.",
+            "O tempo está muito bom hoje.",
+            "Quero falar em português.",
+            "Por favor fale devagar e claramente.",
+            "Esta é uma frase de teste.",
+            "Eu gosto de ler livros.",
+            "Você pode me ajudar?",
+            "Qual é o seu número de telefone?",
+            "Vou vir amanhã de manhã.",
+            "Obrigado pelo seu tempo.",
+            "Este lugar é muito bonito.",
+            "Estou aprendendo a cozinhar.",
+            "Este é o caminho certo?",
+            "Quero ir para a escola.",
+            "Quantos anos você tem?",
+            "Quanto custa isso?",
+            "Estou cansado.",
+            "De onde você é?",
+            "Vamos dar um passeio no parque."
+        ],
+        "nl": [
+            "Hallo, hoe gaat het vandaag met je?",
+            "Mijn naam is Jan de Vries.",
+            "Het weer is vandaag erg mooi.",
+            "Ik wil in het Nederlands spreken.",
+            "Spreek langzaam en duidelijk alstublieft.",
+            "Dit is een testzin.",
+            "Ik houd van het lezen van boeken.",
+            "Kun je me helpen?",
+            "Wat is je telefoonnummer?",
+            "Ik kom morgenochtend.",
+            "Bedankt voor je tijd.",
+            "Deze plek is erg mooi.",
+            "Ik leer koken.",
+            "Is dit de juiste weg?",
+            "Ik wil naar school gaan.",
+            "Hoe oud ben je?",
+            "Hoeveel kost dit?",
+            "Ik ben moe.",
+            "Waar kom je vandaan?",
+            "Laten we een wandeling maken in het park."
+        ]
+    }
+
+    fallback_phrases = language_fallbacks.get(language, language_fallbacks["en"])
 
     if max_phrases:
         fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
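
The final capping step cannot over-ask `random.sample`, because the requested count is clamped to the list length:

```python
# Clamped sampling: safe even when max_phrases exceeds the fallback list size.
import random

fallback_phrases = ["alpha", "beta", "gamma"]
max_phrases = 10
picked = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
print(picked)  # all three phrases, in random order
```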
@@ -523,7 +746,8 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
             ⚠️ No HF_TOKEN detected
           </p>
           <p style="color: rgb(234, 88, 12); margin: 6px 0 0; font-size: 12px;">
-            Set HF_TOKEN environment variable to access NVIDIA Granary dataset with authentic multilingual phrases.
+            Set HF_TOKEN environment variable to access CohereLabs AYA Collection dataset with authentic multilingual phrases.
+            This dataset provides high-quality text in 100+ languages for all Voxtral Mini supported languages.
             Currently using fallback phrases for demonstration.
           </p>
         </div>
@@ -533,43 +757,21 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
     # Hidden state to track dataset JSONL path
     jsonl_path_state = gr.State("")
 
-    # Language selection for NVIDIA Granary phrases
+    # Language selection for Voxtral Mini supported languages
     language_selector = gr.Dropdown(
         choices=[
             ("English", "en"),
-            ("German", "de"),
             ("French", "fr"),
+            ("German", "de"),
             ("Spanish", "es"),
             ("Italian", "it"),
             ("Portuguese", "pt"),
-            ("Polish", "pl"),
             ("Dutch", "nl"),
-            ("Russian", "ru"),
-            ("Arabic", "ar"),
-            ("Chinese", "zh"),
-            ("Japanese", "ja"),
-            ("Korean", "ko"),
-            ("Danish", "da"),
-            ("Swedish", "sv"),
-            ("Norwegian", "no"),
-            ("Finnish", "fi"),
-            ("Estonian", "et"),
-            ("Latvian", "lv"),
-            ("Lithuanian", "lt"),
-            ("Slovenian", "sl"),
-            ("Slovak", "sk"),
-            ("Czech", "cs"),
-            ("Croatian", "hr"),
-            ("Bulgarian", "bg"),
-            ("Ukrainian", "uk"),
-            ("Romanian", "ro"),
-            ("Hungarian", "hu"),
-            ("Greek", "el"),
-            ("Maltese", "mt")
+            ("Hindi", "hi")
         ],
         value="en",
         label="Language for Speech Phrases",
-        info="Select language for authentic phrases from NVIDIA Granary dataset (25 European languages)"
+        info="Select language for authentic phrases (Voxtral Mini supported languages). All languages use CohereLabs AYA Collection dataset when HF_TOKEN is available."
     )
 
     # Recording grid with dynamic text readouts
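
`gr.Dropdown` accepts `(label, value)` tuples, so the UI shows the full language name while handlers receive the short code that `load_multilingual_phrases` expects. A minimal standalone illustration (hypothetical app, not the commit's layout):

```python
# (label, value) dropdown pattern: the callback receives the language code.
import gradio as gr

with gr.Blocks() as demo:
    lang = gr.Dropdown(
        choices=[("English", "en"), ("French", "fr"), ("Hindi", "hi")],
        value="en",
        label="Language for Speech Phrases",
    )
    out = gr.Textbox(label="Selected code")
    lang.change(fn=lambda code: code, inputs=lang, outputs=out)

# demo.launch()  # uncomment to run locally
```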
 
requirements.txt CHANGED
@@ -1,7 +1,13 @@
 torch
+triton
+torchvision
+torchaudio
 datasets
 peft
 transformers
 gradio
 trackio
 huggingface_hub
+soundfile
+librosa
+mistral-common
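
These additions round out the audio stack: triton, torchvision, and torchaudio complete the PyTorch install, soundfile and librosa cover audio I/O and resampling, and mistral-common supplies the Mistral tokenization support that Voxtral checkpoints rely on. A plain `pip install -r requirements.txt` pulls everything in.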