Spaces:
Running
Running
Joseph Pollack
committed on
adds requirements, improves interface and dataset loading
Browse files- .gitignore +1 -0
- interface.py +344 -142
- requirements.txt +7 -1
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
datasets/
|
interface.py
CHANGED
@@ -50,24 +50,45 @@ def get_username_from_token(token: str) -> Optional[str]:
|
|
50 |
def run_command_stream(args: list[str], env: Dict[str, str], cwd: Optional[Path] = None) -> Generator[str, None, int]:
|
51 |
import subprocess
|
52 |
import shlex
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
|
73 |
def detect_nvidia_driver() -> Tuple[bool, str]:
|
@@ -290,64 +311,93 @@ def start_voxtral_training(
|
|
290 |
freeze_audio_tower: bool,
|
291 |
push_to_hub: bool,
|
292 |
deploy_demo: bool,
|
293 |
-
) ->
|
|
|
294 |
env = os.environ.copy()
|
295 |
write_token = env.get("HF_WRITE_TOKEN") or env.get("HF_TOKEN")
|
296 |
read_token = env.get("HF_READ_TOKEN")
|
297 |
username = get_username_from_token(write_token or "") or env.get("HF_USERNAME") or ""
|
298 |
output_dir = PROJECT_ROOT / "outputs" / repo_short
|
299 |
|
300 |
-
#
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
"
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
]
|
316 |
-
if use_lora:
|
317 |
args += [
|
318 |
-
"--
|
319 |
-
"--
|
320 |
-
"--
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
# 2) Push to Hub
|
328 |
-
if push_to_hub:
|
329 |
-
repo_name = f"{username}/{repo_short}" if username else repo_short
|
330 |
-
push_args = [
|
331 |
-
str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
|
332 |
-
"model",
|
333 |
-
str(output_dir),
|
334 |
-
repo_name,
|
335 |
]
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
|
352 |
|
353 |
def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
@@ -371,35 +421,45 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
|
371 |
if max_phrases is None:
|
372 |
max_phrases = 1000
|
373 |
|
374 |
-
# Language code mapping for
|
375 |
-
#
|
376 |
-
|
377 |
-
"en": "
|
378 |
-
"
|
379 |
-
"
|
380 |
-
"
|
381 |
-
"
|
382 |
-
"
|
|
|
|
|
383 |
}
|
384 |
|
385 |
-
# Map input language to
|
386 |
-
|
|
|
|
|
|
|
387 |
|
388 |
try:
|
389 |
-
print(f"Loading phrases from
|
390 |
|
391 |
# Check for authentication token
|
392 |
token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
393 |
|
394 |
-
#
|
395 |
-
# Use streaming to handle large datasets efficiently
|
396 |
if token:
|
397 |
-
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
399 |
else:
|
400 |
-
print(
|
401 |
-
|
402 |
|
|
|
403 |
phrases = []
|
404 |
count = 0
|
405 |
seen_phrases = set()
|
@@ -409,8 +469,10 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
|
409 |
if count >= max_phrases:
|
410 |
break
|
411 |
|
412 |
-
# Extract
|
413 |
-
|
|
|
|
|
414 |
|
415 |
# Filter for quality phrases
|
416 |
if (text and
|
@@ -427,45 +489,206 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
|
427 |
if phrases:
|
428 |
# Shuffle the phrases for variety
|
429 |
random.shuffle(phrases)
|
430 |
-
|
|
|
431 |
return phrases
|
432 |
|
433 |
else:
|
434 |
-
print(f"No suitable phrases found in
|
435 |
raise Exception("No phrases found")
|
436 |
|
437 |
except Exception as e:
|
438 |
error_msg = str(e).lower()
|
439 |
if "401" in error_msg or "unauthorized" in error_msg:
|
440 |
-
print(f"
|
441 |
print("This dataset requires a Hugging Face token. Please set HF_TOKEN environment variable.")
|
442 |
else:
|
443 |
-
print(f"
|
444 |
|
445 |
-
# Fallback to basic phrases if
|
446 |
print("Using fallback phrases")
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
"
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
|
470 |
if max_phrases:
|
471 |
fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
|
@@ -523,7 +746,8 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
523 |
⚠️ No HF_TOKEN detected
|
524 |
</p>
|
525 |
<p style="color: rgb(234, 88, 12); margin: 6px 0 0; font-size: 12px;">
|
526 |
-
Set HF_TOKEN environment variable to access
|
|
|
527 |
Currently using fallback phrases for demonstration.
|
528 |
</p>
|
529 |
</div>
|
@@ -533,43 +757,21 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
533 |
# Hidden state to track dataset JSONL path
|
534 |
jsonl_path_state = gr.State("")
|
535 |
|
536 |
-
# Language selection for
|
537 |
language_selector = gr.Dropdown(
|
538 |
choices=[
|
539 |
("English", "en"),
|
540 |
-
("German", "de"),
|
541 |
("French", "fr"),
|
|
|
542 |
("Spanish", "es"),
|
543 |
("Italian", "it"),
|
544 |
("Portuguese", "pt"),
|
545 |
-
("Polish", "pl"),
|
546 |
("Dutch", "nl"),
|
547 |
-
("
|
548 |
-
("Arabic", "ar"),
|
549 |
-
("Chinese", "zh"),
|
550 |
-
("Japanese", "ja"),
|
551 |
-
("Korean", "ko"),
|
552 |
-
("Danish", "da"),
|
553 |
-
("Swedish", "sv"),
|
554 |
-
("Norwegian", "no"),
|
555 |
-
("Finnish", "fi"),
|
556 |
-
("Estonian", "et"),
|
557 |
-
("Latvian", "lv"),
|
558 |
-
("Lithuanian", "lt"),
|
559 |
-
("Slovenian", "sl"),
|
560 |
-
("Slovak", "sk"),
|
561 |
-
("Czech", "cs"),
|
562 |
-
("Croatian", "hr"),
|
563 |
-
("Bulgarian", "bg"),
|
564 |
-
("Ukrainian", "uk"),
|
565 |
-
("Romanian", "ro"),
|
566 |
-
("Hungarian", "hu"),
|
567 |
-
("Greek", "el"),
|
568 |
-
("Maltese", "mt")
|
569 |
],
|
570 |
value="en",
|
571 |
label="Language for Speech Phrases",
|
572 |
-
info="Select language for authentic phrases
|
573 |
)
|
574 |
|
575 |
# Recording grid with dynamic text readouts
|
|
|
50 |
def run_command_stream(args: list[str], env: Dict[str, str], cwd: Optional[Path] = None) -> Generator[str, None, int]:
    """Run a Python script in a subprocess and stream its output line by line.

    The executed command is ``[get_python()] + args``; the child's stderr is
    merged into its stdout. The generator yields, in order:

    1. an echo of the command (shell-quoted, with the values of token-like
       options masked so credentials never reach the logs),
    2. every non-blank output line with trailing whitespace stripped,
    3. a one-line success/failure status message.

    If the consumer stops iterating early, or an error interrupts streaming,
    the child process is terminated and its pipe closed instead of being left
    running in the background.

    Args:
        args: Arguments passed to the interpreter (script path first).
        env: Environment for the child process.
        cwd: Working directory; falls back to ``PROJECT_ROOT`` when omitted.

    Yields:
        Log lines as described above.

    Returns:
        The child's exit code, or 1 if the process could not be started or
        streaming failed (exposed to callers as ``StopIteration.value``).
    """
    import subprocess
    import shlex

    process = None
    try:
        command = [get_python()] + args

        # Build a display-only copy of the command with secrets masked.
        # Callers pass credentials on the command line (e.g. "--hf-token <value>"),
        # and every yielded line ends up in user-visible logs. Masking errs on
        # the side of hiding too much; the executed command is unaffected.
        shown = []
        mask_next = False
        for arg in command:
            flag, sep, _ = arg.partition("=")
            is_secret_flag = flag.startswith("-") and "token" in flag.lower()
            if mask_next:
                shown.append("***")
                mask_next = False
            elif is_secret_flag and sep:
                # "--flag=value" form: keep the flag, hide the value.
                shown.append(f"{flag}=***")
            else:
                shown.append(arg)
                mask_next = is_secret_flag
        cmd_line = ' '.join(shlex.quote(a) for a in shown)
        yield f"$ {cmd_line}"

        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,  # universal_newlines is an alias of text; one is enough
            env=env,
            cwd=str(cwd or PROJECT_ROOT),
            bufsize=1,  # line-buffered so output is streamed as it is produced
        )

        if process.stdout is None:
            yield "❌ Error: Could not capture process output"
            return 1

        for raw_line in process.stdout:
            if raw_line.strip():  # Only yield non-empty lines
                yield raw_line.rstrip()

        process.stdout.close()
        code = process.wait()

        if code != 0:
            yield f"❌ Command failed with exit code: {code}"
        else:
            yield f"✅ Command completed successfully (exit code: {code})"

        return code

    except FileNotFoundError as e:
        yield f"❌ Error: Python executable not found: {e}"
        return 1
    except Exception as e:
        yield f"❌ Error running command: {str(e)}"
        return 1
    finally:
        # Runs on normal completion, on errors, and when the consumer closes
        # the generator early (GeneratorExit). Must not yield.
        if process is not None:
            if process.stdout is not None and not process.stdout.closed:
                process.stdout.close()
            if process.poll() is None:
                # Child is still alive: stop it rather than leaking it.
                process.terminate()
                try:
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait()
|
92 |
|
93 |
|
94 |
def detect_nvidia_driver() -> Tuple[bool, str]:
|
|
|
311 |
freeze_audio_tower: bool,
|
312 |
push_to_hub: bool,
|
313 |
deploy_demo: bool,
|
314 |
+
) -> str:
|
315 |
+
"""Start Voxtral training and return collected logs as a string."""
|
316 |
env = os.environ.copy()
|
317 |
write_token = env.get("HF_WRITE_TOKEN") or env.get("HF_TOKEN")
|
318 |
read_token = env.get("HF_READ_TOKEN")
|
319 |
username = get_username_from_token(write_token or "") or env.get("HF_USERNAME") or ""
|
320 |
output_dir = PROJECT_ROOT / "outputs" / repo_short
|
321 |
|
322 |
+
# Collect all logs
|
323 |
+
all_logs = []
|
324 |
+
|
325 |
+
def collect_logs(generator):
|
326 |
+
"""Helper to collect logs from a generator."""
|
327 |
+
for line in generator:
|
328 |
+
all_logs.append(line)
|
329 |
+
print(line) # Also print to console for debugging
|
330 |
+
|
331 |
+
try:
|
332 |
+
# 1) Train
|
333 |
+
script = PROJECT_ROOT / ("scripts/train_lora.py" if use_lora else "scripts/train.py")
|
334 |
+
args = [str(script)]
|
335 |
+
if jsonl_path:
|
336 |
+
args += ["--dataset-jsonl", jsonl_path]
|
|
|
|
|
337 |
args += [
|
338 |
+
"--model-checkpoint", base_model,
|
339 |
+
"--train-count", str(train_count),
|
340 |
+
"--eval-count", str(eval_count),
|
341 |
+
"--batch-size", str(batch_size),
|
342 |
+
"--grad-accum", str(grad_accum),
|
343 |
+
"--learning-rate", str(learning_rate),
|
344 |
+
"--epochs", str(epochs),
|
345 |
+
"--output-dir", str(output_dir),
|
346 |
+
"--save-steps", "50",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
]
|
348 |
+
if use_lora:
|
349 |
+
args += [
|
350 |
+
"--lora-r", str(lora_r),
|
351 |
+
"--lora-alpha", str(lora_alpha),
|
352 |
+
"--lora-dropout", str(lora_dropout),
|
353 |
+
]
|
354 |
+
if freeze_audio_tower:
|
355 |
+
args += ["--freeze-audio-tower"]
|
356 |
+
|
357 |
+
all_logs.append("🚀 Starting Voxtral training...")
|
358 |
+
collect_logs(run_command_stream(args, env))
|
359 |
+
all_logs.append("✅ Training completed!")
|
360 |
+
|
361 |
+
# 2) Push to Hub
|
362 |
+
if push_to_hub:
|
363 |
+
if not username:
|
364 |
+
all_logs.append("❌ Cannot push to Hub: No username available. Set HF_TOKEN or HF_USERNAME.")
|
365 |
+
else:
|
366 |
+
repo_name = f"{username}/{repo_short}"
|
367 |
+
push_args = [
|
368 |
+
str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
|
369 |
+
"model",
|
370 |
+
str(output_dir),
|
371 |
+
repo_name,
|
372 |
+
]
|
373 |
+
all_logs.append(f"📤 Pushing model to Hugging Face Hub: {repo_name}")
|
374 |
+
collect_logs(run_command_stream(push_args, env))
|
375 |
+
all_logs.append("✅ Model pushed successfully!")
|
376 |
+
|
377 |
+
# 3) Deploy demo Space
|
378 |
+
if deploy_demo and username:
|
379 |
+
deploy_args = [
|
380 |
+
str(PROJECT_ROOT / "scripts/deploy_demo_space.py"),
|
381 |
+
"--hf-token", write_token or "",
|
382 |
+
"--hf-username", username,
|
383 |
+
"--model-id", f"{username}/{repo_short}",
|
384 |
+
"--demo-type", "voxtral",
|
385 |
+
"--space-name", f"{repo_short}-demo",
|
386 |
+
]
|
387 |
+
all_logs.append("🚀 Deploying demo Space...")
|
388 |
+
collect_logs(run_command_stream(deploy_args, env))
|
389 |
+
all_logs.append("✅ Demo Space deployed!")
|
390 |
+
|
391 |
+
# Return all collected logs as a single string
|
392 |
+
return "\n".join(all_logs)
|
393 |
+
|
394 |
+
except Exception as e:
|
395 |
+
error_msg = f"❌ Error during training: {str(e)}"
|
396 |
+
all_logs.append(error_msg)
|
397 |
+
print(error_msg) # Also print to console
|
398 |
+
import traceback
|
399 |
+
traceback.print_exc()
|
400 |
+
return "\n".join(all_logs)
|
401 |
|
402 |
|
403 |
def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
|
|
|
421 |
if max_phrases is None:
|
422 |
max_phrases = 1000
|
423 |
|
424 |
+
# Language code mapping for CohereLabs AYA Collection dataset
|
425 |
+
# All Voxtral Mini supported languages are available in AYA Collection
|
426 |
+
aya_supported_langs = {
|
427 |
+
"en": "english", # English
|
428 |
+
"fr": "french", # French
|
429 |
+
"de": "german", # German
|
430 |
+
"es": "spanish", # Spanish
|
431 |
+
"it": "italian", # Italian
|
432 |
+
"pt": "portuguese", # Portuguese
|
433 |
+
"nl": "dutch", # Dutch
|
434 |
+
"hi": "hindi" # Hindi
|
435 |
}
|
436 |
|
437 |
+
# Map input language to CohereLabs AYA Collection configuration
|
438 |
+
aya_lang = aya_supported_langs.get(language)
|
439 |
+
|
440 |
+
if not aya_lang:
|
441 |
+
raise Exception(f"Language {language} not supported in CohereLabs AYA Collection dataset")
|
442 |
|
443 |
try:
|
444 |
+
print(f"Loading phrases from CohereLabs AYA Collection dataset for language: {language}")
|
445 |
|
446 |
# Check for authentication token
|
447 |
token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
448 |
|
449 |
+
# Try to load CohereLabs AYA Collection dataset for the specified language
|
|
|
450 |
if token:
|
451 |
+
try:
|
452 |
+
ds = load_dataset("CohereLabs/aya_collection_language_split", aya_lang, split="train", streaming=True, token=token)
|
453 |
+
print(f"Successfully loaded CohereLabs AYA Collection {language} dataset")
|
454 |
+
except Exception as e:
|
455 |
+
# Fallback to other datasets
|
456 |
+
print(f"CohereLabs AYA Collection {language} not available ({e}), trying alternative datasets")
|
457 |
+
raise Exception("AYA Collection not available")
|
458 |
else:
|
459 |
+
print("No HF_TOKEN found for CohereLabs AYA Collection dataset")
|
460 |
+
raise Exception("No token available")
|
461 |
|
462 |
+
# Common processing for both dataset types
|
463 |
phrases = []
|
464 |
count = 0
|
465 |
seen_phrases = set()
|
|
|
469 |
if count >= max_phrases:
|
470 |
break
|
471 |
|
472 |
+
# Extract text from CohereLabs AYA Collection format: combine inputs and targets
|
473 |
+
inputs_text = example.get("inputs", "").strip()
|
474 |
+
targets_text = example.get("targets", "").strip()
|
475 |
+
text = f"{inputs_text} {targets_text}".strip()
|
476 |
|
477 |
# Filter for quality phrases
|
478 |
if (text and
|
|
|
489 |
if phrases:
|
490 |
# Shuffle the phrases for variety
|
491 |
random.shuffle(phrases)
|
492 |
+
dataset_name = "CohereLabs AYA Collection"
|
493 |
+
print(f"Successfully loaded {len(phrases)} phrases from {dataset_name} dataset for {language}")
|
494 |
return phrases
|
495 |
|
496 |
else:
|
497 |
+
print(f"No suitable phrases found in dataset for {language}")
|
498 |
raise Exception("No phrases found")
|
499 |
|
500 |
except Exception as e:
|
501 |
error_msg = str(e).lower()
|
502 |
if "401" in error_msg or "unauthorized" in error_msg:
|
503 |
+
print(f"CohereLabs AYA Collection authentication failed for {language}: {e}")
|
504 |
print("This dataset requires a Hugging Face token. Please set HF_TOKEN environment variable.")
|
505 |
else:
|
506 |
+
print(f"CohereLabs AYA Collection loading failed for {language}: {e}")
|
507 |
|
508 |
+
# Fallback to basic phrases if dataset loading fails
|
509 |
print("Using fallback phrases")
|
510 |
+
|
511 |
+
# Language-specific fallback phrases
|
512 |
+
language_fallbacks = {
|
513 |
+
"hi": [
|
514 |
+
"नमस्ते, आज आप कैसे हैं?",
|
515 |
+
"मेरा नाम राजेश कुमार है।",
|
516 |
+
"आज का मौसम बहुत अच्छा है।",
|
517 |
+
"मैं हिंदी में बात करना चाहता हूं।",
|
518 |
+
"कृपया धीरे और स्पष्ट बोलें।",
|
519 |
+
"यह एक परीक्षण वाक्य है।",
|
520 |
+
"मैं पुस्तकें पढ़ना पसंद करता हूं।",
|
521 |
+
"क्या आप मेरी मदद कर सकते हैं?",
|
522 |
+
"आपका फोन नंबर क्या है?",
|
523 |
+
"मैं कल सुबह आऊंगा।",
|
524 |
+
"धन्यवाद, आपका समय देने के लिए।",
|
525 |
+
"यह जगह बहुत सुंदर है।",
|
526 |
+
"मैं भोजन तैयार करना सीख रहा हूं।",
|
527 |
+
"क्या यह रास्ता सही है?",
|
528 |
+
"मैं स्कूल जाना चाहता हूं।",
|
529 |
+
"आपकी उम्र क्या है?",
|
530 |
+
"यह कितने का है?",
|
531 |
+
"मैं थक गया हूं।",
|
532 |
+
"आप कहां से हैं?",
|
533 |
+
"चलिए पार्क में टहलते हैं।"
|
534 |
+
],
|
535 |
+
"en": [
|
536 |
+
"Hello, how are you today?",
|
537 |
+
"My name is John Smith.",
|
538 |
+
"The weather is very nice today.",
|
539 |
+
"I want to speak in English.",
|
540 |
+
"Please speak slowly and clearly.",
|
541 |
+
"This is a test sentence.",
|
542 |
+
"I enjoy reading books.",
|
543 |
+
"Can you help me?",
|
544 |
+
"What is your phone number?",
|
545 |
+
"I will come tomorrow morning.",
|
546 |
+
"Thank you for your time.",
|
547 |
+
"This place is very beautiful.",
|
548 |
+
"I am learning to cook food.",
|
549 |
+
"Is this the right way?",
|
550 |
+
"I want to go to school.",
|
551 |
+
"How old are you?",
|
552 |
+
"How much does this cost?",
|
553 |
+
"I am tired.",
|
554 |
+
"Where are you from?",
|
555 |
+
"Let's go for a walk in the park."
|
556 |
+
],
|
557 |
+
"fr": [
|
558 |
+
"Bonjour, comment allez-vous aujourd'hui?",
|
559 |
+
"Je m'appelle Jean Dupont.",
|
560 |
+
"Le temps est très beau aujourd'hui.",
|
561 |
+
"Je veux parler en français.",
|
562 |
+
"Parlez lentement et clairement s'il vous plaît.",
|
563 |
+
"Ceci est une phrase de test.",
|
564 |
+
"J'aime lire des livres.",
|
565 |
+
"Pouvez-vous m'aider?",
|
566 |
+
"Quel est votre numéro de téléphone?",
|
567 |
+
"Je viendrai demain matin.",
|
568 |
+
"Merci pour votre temps.",
|
569 |
+
"Cet endroit est très beau.",
|
570 |
+
"J'apprends à cuisiner.",
|
571 |
+
"Est-ce le bon chemin?",
|
572 |
+
"Je veux aller à l'école.",
|
573 |
+
"Quel âge avez-vous?",
|
574 |
+
"Combien cela coûte-t-il?",
|
575 |
+
"Je suis fatigué.",
|
576 |
+
"D'où venez-vous?",
|
577 |
+
"Allons nous promener dans le parc."
|
578 |
+
],
|
579 |
+
"de": [
|
580 |
+
"Hallo, wie geht es Ihnen heute?",
|
581 |
+
"Mein Name ist Hans Müller.",
|
582 |
+
"Das Wetter ist heute sehr schön.",
|
583 |
+
"Ich möchte auf Deutsch sprechen.",
|
584 |
+
"Sprechen Sie bitte langsam und deutlich.",
|
585 |
+
"Dies ist ein Testsatz.",
|
586 |
+
"Ich lese gerne Bücher.",
|
587 |
+
"Können Sie mir helfen?",
|
588 |
+
"Wie ist Ihre Telefonnummer?",
|
589 |
+
"Ich komme morgen früh.",
|
590 |
+
"Vielen Dank für Ihre Zeit.",
|
591 |
+
"Dieser Ort ist sehr schön.",
|
592 |
+
"Ich lerne kochen.",
|
593 |
+
"Ist das der richtige Weg?",
|
594 |
+
"Ich möchte zur Schule gehen.",
|
595 |
+
"Wie alt sind Sie?",
|
596 |
+
"Wie viel kostet das?",
|
597 |
+
"Ich bin müde.",
|
598 |
+
"Woher kommen Sie?",
|
599 |
+
"Lassen Sie uns im Park spazieren gehen."
|
600 |
+
],
|
601 |
+
"es": [
|
602 |
+
"Hola, ¿cómo estás hoy?",
|
603 |
+
"Me llamo Juan García.",
|
604 |
+
"El tiempo está muy bueno hoy.",
|
605 |
+
"Quiero hablar en español.",
|
606 |
+
"Por favor habla despacio y claro.",
|
607 |
+
"Esta es una oración de prueba.",
|
608 |
+
"Me gusta leer libros.",
|
609 |
+
"¿Puedes ayudarme?",
|
610 |
+
"¿Cuál es tu número de teléfono?",
|
611 |
+
"Vendré mañana por la mañana.",
|
612 |
+
"Gracias por tu tiempo.",
|
613 |
+
"Este lugar es muy bonito.",
|
614 |
+
"Estoy aprendiendo a cocinar.",
|
615 |
+
"¿Es este el camino correcto?",
|
616 |
+
"Quiero ir a la escuela.",
|
617 |
+
"¿Cuántos años tienes?",
|
618 |
+
"¿Cuánto cuesta esto?",
|
619 |
+
"Estoy cansado.",
|
620 |
+
"¿De dónde eres?",
|
621 |
+
"Vamos a caminar por el parque."
|
622 |
+
],
|
623 |
+
"it": [
|
624 |
+
"Ciao, come stai oggi?",
|
625 |
+
"Mi chiamo Mario Rossi.",
|
626 |
+
"Il tempo è molto bello oggi.",
|
627 |
+
"Voglio parlare in italiano.",
|
628 |
+
"Per favore parla lentamente e chiaramente.",
|
629 |
+
"Questa è una frase di prova.",
|
630 |
+
"Mi piace leggere libri.",
|
631 |
+
"Puoi aiutarmi?",
|
632 |
+
"Qual è il tuo numero di telefono?",
|
633 |
+
"Verrò domani mattina.",
|
634 |
+
"Grazie per il tuo tempo.",
|
635 |
+
"Questo posto è molto bello.",
|
636 |
+
"Sto imparando a cucinare.",
|
637 |
+
"È questa la strada giusta?",
|
638 |
+
"Voglio andare a scuola.",
|
639 |
+
"Quanti anni hai?",
|
640 |
+
"Quanto costa questo?",
|
641 |
+
"Sono stanco.",
|
642 |
+
"Da dove vieni?",
|
643 |
+
"Andiamo a fare una passeggiata nel parco."
|
644 |
+
],
|
645 |
+
"pt": [
|
646 |
+
"Olá, como você está hoje?",
|
647 |
+
"Meu nome é João Silva.",
|
648 |
+
"O tempo está muito bom hoje.",
|
649 |
+
"Quero falar em português.",
|
650 |
+
"Por favor fale devagar e claramente.",
|
651 |
+
"Esta é uma frase de teste.",
|
652 |
+
"Eu gosto de ler livros.",
|
653 |
+
"Você pode me ajudar?",
|
654 |
+
"Qual é o seu número de telefone?",
|
655 |
+
"Vou vir amanhã de manhã.",
|
656 |
+
"Obrigado pelo seu tempo.",
|
657 |
+
"Este lugar é muito bonito.",
|
658 |
+
"Estou aprendendo a cozinhar.",
|
659 |
+
"Este é o caminho certo?",
|
660 |
+
"Quero ir para a escola.",
|
661 |
+
"Quantos anos você tem?",
|
662 |
+
"Quanto custa isso?",
|
663 |
+
"Estou cansado.",
|
664 |
+
"De onde você é?",
|
665 |
+
"Vamos dar um passeio no parque."
|
666 |
+
],
|
667 |
+
"nl": [
|
668 |
+
"Hallo, hoe gaat het vandaag met je?",
|
669 |
+
"Mijn naam is Jan de Vries.",
|
670 |
+
"Het weer is vandaag erg mooi.",
|
671 |
+
"Ik wil in het Nederlands spreken.",
|
672 |
+
"Spreek langzaam en duidelijk alstublieft.",
|
673 |
+
"Dit is een testzin.",
|
674 |
+
"Ik houd van het lezen van boeken.",
|
675 |
+
"Kun je me helpen?",
|
676 |
+
"Wat is je telefoonnummer?",
|
677 |
+
"Ik kom morgenochtend.",
|
678 |
+
"Bedankt voor je tijd.",
|
679 |
+
"Deze plek is erg mooi.",
|
680 |
+
"Ik leer koken.",
|
681 |
+
"Is dit de juiste weg?",
|
682 |
+
"Ik wil naar school gaan.",
|
683 |
+
"Hoe oud ben je?",
|
684 |
+
"Hoeveel kost dit?",
|
685 |
+
"Ik ben moe.",
|
686 |
+
"Waar kom je vandaan?",
|
687 |
+
"Laten we een wandeling maken in het park."
|
688 |
+
]
|
689 |
+
}
|
690 |
+
|
691 |
+
fallback_phrases = language_fallbacks.get(language, language_fallbacks["en"])
|
692 |
|
693 |
if max_phrases:
|
694 |
fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
|
|
|
746 |
⚠️ No HF_TOKEN detected
|
747 |
</p>
|
748 |
<p style="color: rgb(234, 88, 12); margin: 6px 0 0; font-size: 12px;">
|
749 |
+
Set HF_TOKEN environment variable to access CohereLabs AYA Collection dataset with authentic multilingual phrases.
|
750 |
+
This dataset provides high-quality text in 100+ languages for all Voxtral Mini supported languages.
|
751 |
Currently using fallback phrases for demonstration.
|
752 |
</p>
|
753 |
</div>
|
|
|
757 |
# Hidden state to track dataset JSONL path
|
758 |
jsonl_path_state = gr.State("")
|
759 |
|
760 |
+
# Language selection for Voxtral Mini supported languages
|
761 |
language_selector = gr.Dropdown(
|
762 |
choices=[
|
763 |
("English", "en"),
|
|
|
764 |
("French", "fr"),
|
765 |
+
("German", "de"),
|
766 |
("Spanish", "es"),
|
767 |
("Italian", "it"),
|
768 |
("Portuguese", "pt"),
|
|
|
769 |
("Dutch", "nl"),
|
770 |
+
("Hindi", "hi")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
771 |
],
|
772 |
value="en",
|
773 |
label="Language for Speech Phrases",
|
774 |
+
info="Select language for authentic phrases (Voxtral Mini supported languages). All languages use CohereLabs AYA Collection dataset when HF_TOKEN is available."
|
775 |
)
|
776 |
|
777 |
# Recording grid with dynamic text readouts
|
requirements.txt
CHANGED
@@ -1,7 +1,13 @@
|
|
1 |
torch
|
|
|
|
|
|
|
2 |
datasets
|
3 |
peft
|
4 |
transformers
|
5 |
gradio
|
6 |
trackio
|
7 |
-
huggingface_hub
|
|
|
|
|
|
|
|
1 |
torch
|
2 |
+
triton
|
3 |
+
torchvision
|
4 |
+
torchaudio
|
5 |
datasets
|
6 |
peft
|
7 |
transformers
|
8 |
gradio
|
9 |
trackio
|
10 |
+
huggingface_hub
|
11 |
+
soundfile
|
12 |
+
librosa
|
13 |
+
mistral-common
|