Spaces: Running

Joseph Pollack committed
adds requirements, improves interface and dataset loading

Files changed:
- .gitignore +1 -0
- interface.py +344 -142
- requirements.txt +7 -1
.gitignore ADDED

@@ -0,0 +1 @@
+datasets/
interface.py CHANGED

@@ -50,24 +50,45 @@ def get_username_from_token(token: str) -> Optional[str]:

@@ -290,64 +311,93 @@ def start_voxtral_training(
-    if use_lora:
-    # 2) Push to Hub
-    if push_to_hub:
-        repo_name = f"{username}/{repo_short}" if username else repo_short
-        push_args = [
-            str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
-            "model",
-            str(output_dir),
-            repo_name,

@@ -371,35 +421,45 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
-        # Use streaming to handle large datasets efficiently

@@ -409,8 +469,10 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):

@@ -427,45 +489,206 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"):

@@ -523,7 +746,8 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

@@ -533,43 +757,21 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
-            ("German", "de"),
-            ("Polish", "pl"),
-            ("Arabic", "ar"),
-            ("Chinese", "zh"),
-            ("Japanese", "ja"),
-            ("Korean", "ko"),
-            ("Danish", "da"),
-            ("Swedish", "sv"),
-            ("Norwegian", "no"),
-            ("Finnish", "fi"),
-            ("Estonian", "et"),
-            ("Latvian", "lv"),
-            ("Lithuanian", "lt"),
-            ("Slovenian", "sl"),
-            ("Slovak", "sk"),
-            ("Czech", "cs"),
-            ("Croatian", "hr"),
-            ("Bulgarian", "bg"),
-            ("Ukrainian", "uk"),
-            ("Romanian", "ro"),
-            ("Hungarian", "hu"),
-            ("Greek", "el"),
-            ("Maltese", "mt")
 def run_command_stream(args: list[str], env: Dict[str, str], cwd: Optional[Path] = None) -> Generator[str, None, int]:
     import subprocess
     import shlex
+    try:
+        cmd_line = ' '.join(shlex.quote(a) for a in ([get_python()] + args))
+        yield f"$ {cmd_line}"
+
+        process = subprocess.Popen(
+            [get_python()] + args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            env=env,
+            cwd=str(cwd or PROJECT_ROOT),
+            bufsize=1,
+            universal_newlines=True,
+        )
+
+        if process.stdout is None:
+            yield "❌ Error: Could not capture process output"
+            return 1
+
+        for line in iter(process.stdout.readline, ""):
+            if line.strip():  # Only yield non-empty lines
+                yield line.rstrip()
+
+        process.stdout.close()
+        code = process.wait()
+
+        if code != 0:
+            yield f"❌ Command failed with exit code: {code}"
+        else:
+            yield f"✅ Command completed successfully (exit code: {code})"
+
+        return code
+
+    except FileNotFoundError as e:
+        yield f"❌ Error: Python executable not found: {e}"
+        return 1
+    except Exception as e:
+        yield f"❌ Error running command: {str(e)}"
+        return 1


 def detect_nvidia_driver() -> Tuple[bool, str]:
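For orientation, here is a minimal sketch of how a log-streaming generator like run_command_stream is typically consumed from a Gradio callback. The wrapper name is hypothetical and it assumes run_command_stream is importable from interface.py; it is not part of this commit.

# Hypothetical consumer: accumulate yielded log lines and re-render them on each update.
def stream_training_logs(args, env):
    transcript = []
    for line in run_command_stream(args, env):  # yields one cleaned log line at a time
        transcript.append(line)
        yield "\n".join(transcript)  # Gradio refreshes the bound output component on every yield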
     freeze_audio_tower: bool,
     push_to_hub: bool,
     deploy_demo: bool,
+) -> str:
+    """Start Voxtral training and return collected logs as a string."""
     env = os.environ.copy()
     write_token = env.get("HF_WRITE_TOKEN") or env.get("HF_TOKEN")
     read_token = env.get("HF_READ_TOKEN")
     username = get_username_from_token(write_token or "") or env.get("HF_USERNAME") or ""
     output_dir = PROJECT_ROOT / "outputs" / repo_short

+    # Collect all logs
+    all_logs = []
+
+    def collect_logs(generator):
+        """Helper to collect logs from a generator."""
+        for line in generator:
+            all_logs.append(line)
+            print(line)  # Also print to console for debugging
+
+    try:
+        # 1) Train
+        script = PROJECT_ROOT / ("scripts/train_lora.py" if use_lora else "scripts/train.py")
+        args = [str(script)]
+        if jsonl_path:
+            args += ["--dataset-jsonl", jsonl_path]
         args += [
+            "--model-checkpoint", base_model,
+            "--train-count", str(train_count),
+            "--eval-count", str(eval_count),
+            "--batch-size", str(batch_size),
+            "--grad-accum", str(grad_accum),
+            "--learning-rate", str(learning_rate),
+            "--epochs", str(epochs),
+            "--output-dir", str(output_dir),
+            "--save-steps", "50",
         ]
+        if use_lora:
+            args += [
+                "--lora-r", str(lora_r),
+                "--lora-alpha", str(lora_alpha),
+                "--lora-dropout", str(lora_dropout),
+            ]
+        if freeze_audio_tower:
+            args += ["--freeze-audio-tower"]
+
+        all_logs.append("🚀 Starting Voxtral training...")
+        collect_logs(run_command_stream(args, env))
+        all_logs.append("✅ Training completed!")
+
+        # 2) Push to Hub
+        if push_to_hub:
+            if not username:
+                all_logs.append("❌ Cannot push to Hub: No username available. Set HF_TOKEN or HF_USERNAME.")
+            else:
+                repo_name = f"{username}/{repo_short}"
+                push_args = [
+                    str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
+                    "model",
+                    str(output_dir),
+                    repo_name,
+                ]
+                all_logs.append(f"📤 Pushing model to Hugging Face Hub: {repo_name}")
+                collect_logs(run_command_stream(push_args, env))
+                all_logs.append("✅ Model pushed successfully!")
+
+        # 3) Deploy demo Space
+        if deploy_demo and username:
+            deploy_args = [
+                str(PROJECT_ROOT / "scripts/deploy_demo_space.py"),
+                "--hf-token", write_token or "",
+                "--hf-username", username,
+                "--model-id", f"{username}/{repo_short}",
+                "--demo-type", "voxtral",
+                "--space-name", f"{repo_short}-demo",
+            ]
+            all_logs.append("🚀 Deploying demo Space...")
+            collect_logs(run_command_stream(deploy_args, env))
+            all_logs.append("✅ Demo Space deployed!")
+
+        # Return all collected logs as a single string
+        return "\n".join(all_logs)
+
+    except Exception as e:
+        error_msg = f"❌ Error during training: {str(e)}"
+        all_logs.append(error_msg)
+        print(error_msg)  # Also print to console
+        import traceback
+        traceback.print_exc()
+        return "\n".join(all_logs)


 def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
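As a usage sketch only: the full signature of start_voxtral_training is not shown in this diff, so the keyword arguments below are inferred from names referenced in the function body; the values are illustrative and the model id is assumed, not taken from the commit.

# Hypothetical invocation; argument names mirror variables used in the body above.
logs = start_voxtral_training(
    jsonl_path="datasets/my_recordings.jsonl",      # assumed example path
    base_model="mistralai/Voxtral-Mini-3B-2507",    # assumed checkpoint id
    train_count=100,
    eval_count=10,
    batch_size=2,
    grad_accum=4,
    learning_rate=5e-5,
    epochs=1,
    repo_short="voxtral-asr-finetune",
    use_lora=True,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    freeze_audio_tower=True,
    push_to_hub=False,
    deploy_demo=False,
)
print(logs)  # the function returns all collected log lines as one string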
     if max_phrases is None:
         max_phrases = 1000

+    # Language code mapping for CohereLabs AYA Collection dataset
+    # All Voxtral Mini supported languages are available in AYA Collection
+    aya_supported_langs = {
+        "en": "english",     # English
+        "fr": "french",      # French
+        "de": "german",      # German
+        "es": "spanish",     # Spanish
+        "it": "italian",     # Italian
+        "pt": "portuguese",  # Portuguese
+        "nl": "dutch",       # Dutch
+        "hi": "hindi"        # Hindi
     }

+    # Map input language to CohereLabs AYA Collection configuration
+    aya_lang = aya_supported_langs.get(language)
+
+    if not aya_lang:
+        raise Exception(f"Language {language} not supported in CohereLabs AYA Collection dataset")

     try:
+        print(f"Loading phrases from CohereLabs AYA Collection dataset for language: {language}")

         # Check for authentication token
         token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")

+        # Try to load CohereLabs AYA Collection dataset for the specified language
         if token:
+            try:
+                ds = load_dataset("CohereLabs/aya_collection_language_split", aya_lang, split="train", streaming=True, token=token)
+                print(f"Successfully loaded CohereLabs AYA Collection {language} dataset")
+            except Exception as e:
+                # Fallback to other datasets
+                print(f"CohereLabs AYA Collection {language} not available ({e}), trying alternative datasets")
+                raise Exception("AYA Collection not available")
         else:
+            print("No HF_TOKEN found for CohereLabs AYA Collection dataset")
+            raise Exception("No token available")

+        # Common processing for both dataset types
         phrases = []
         count = 0
         seen_phrases = set()

             if count >= max_phrases:
                 break

+            # Extract text from CohereLabs AYA Collection format: combine inputs and targets
+            inputs_text = example.get("inputs", "").strip()
+            targets_text = example.get("targets", "").strip()
+            text = f"{inputs_text} {targets_text}".strip()

             # Filter for quality phrases
             if (text and
         if phrases:
             # Shuffle the phrases for variety
             random.shuffle(phrases)
+            dataset_name = "CohereLabs AYA Collection"
+            print(f"Successfully loaded {len(phrases)} phrases from {dataset_name} dataset for {language}")
             return phrases

         else:
+            print(f"No suitable phrases found in dataset for {language}")
             raise Exception("No phrases found")

     except Exception as e:
         error_msg = str(e).lower()
         if "401" in error_msg or "unauthorized" in error_msg:
+            print(f"CohereLabs AYA Collection authentication failed for {language}: {e}")
             print("This dataset requires a Hugging Face token. Please set HF_TOKEN environment variable.")
         else:
+            print(f"CohereLabs AYA Collection loading failed for {language}: {e}")

+        # Fallback to basic phrases if dataset loading fails
         print("Using fallback phrases")
+
+        # Language-specific fallback phrases
+        language_fallbacks = {
+            "hi": [
+                "नमस्ते, आज आप कैसे हैं?",
+                "मेरा नाम राजेश कुमार है।",
+                "आज का मौसम बहुत अच्छा है।",
+                "मैं हिंदी में बात करना चाहता हूं।",
+                "कृपया धीरे और स्पष्ट बोलें।",
+                "यह एक परीक्षण वाक्य है।",
+                "मैं पुस्तकें पढ़ना पसंद करता हूं।",
+                "क्या आप मेरी मदद कर सकते हैं?",
+                "आपका फोन नंबर क्या है?",
+                "मैं कल सुबह आऊंगा।",
+                "धन्यवाद, आपका समय देने के लिए।",
+                "यह जगह बहुत सुंदर है।",
+                "मैं भोजन तैयार करना सीख रहा हूं।",
+                "क्या यह रास्ता सही है?",
+                "मैं स्कूल जाना चाहता हूं।",
+                "आपकी उम्र क्या है?",
+                "यह कितने का है?",
+                "मैं थक गया हूं।",
+                "आप कहां से हैं?",
+                "चलिए पार्क में टहलते हैं।"
+            ],
+            "en": [
+                "Hello, how are you today?",
+                "My name is John Smith.",
+                "The weather is very nice today.",
+                "I want to speak in English.",
+                "Please speak slowly and clearly.",
+                "This is a test sentence.",
+                "I enjoy reading books.",
+                "Can you help me?",
+                "What is your phone number?",
+                "I will come tomorrow morning.",
+                "Thank you for your time.",
+                "This place is very beautiful.",
+                "I am learning to cook food.",
+                "Is this the right way?",
+                "I want to go to school.",
+                "How old are you?",
+                "How much does this cost?",
+                "I am tired.",
+                "Where are you from?",
+                "Let's go for a walk in the park."
+            ],
+            "fr": [
+                "Bonjour, comment allez-vous aujourd'hui?",
+                "Je m'appelle Jean Dupont.",
+                "Le temps est très beau aujourd'hui.",
+                "Je veux parler en français.",
+                "Parlez lentement et clairement s'il vous plaît.",
+                "Ceci est une phrase de test.",
+                "J'aime lire des livres.",
+                "Pouvez-vous m'aider?",
+                "Quel est votre numéro de téléphone?",
+                "Je viendrai demain matin.",
+                "Merci pour votre temps.",
+                "Cet endroit est très beau.",
+                "J'apprends à cuisiner.",
+                "Est-ce le bon chemin?",
+                "Je veux aller à l'école.",
+                "Quel âge avez-vous?",
+                "Combien cela coûte-t-il?",
+                "Je suis fatigué.",
+                "D'où venez-vous?",
+                "Allons nous promener dans le parc."
+            ],
+            "de": [
+                "Hallo, wie geht es Ihnen heute?",
+                "Mein Name ist Hans Müller.",
+                "Das Wetter ist heute sehr schön.",
+                "Ich möchte auf Deutsch sprechen.",
+                "Sprechen Sie bitte langsam und deutlich.",
+                "Dies ist ein Testsatz.",
+                "Ich lese gerne Bücher.",
+                "Können Sie mir helfen?",
+                "Wie ist Ihre Telefonnummer?",
+                "Ich komme morgen früh.",
+                "Vielen Dank für Ihre Zeit.",
+                "Dieser Ort ist sehr schön.",
+                "Ich lerne kochen.",
+                "Ist das der richtige Weg?",
+                "Ich möchte zur Schule gehen.",
+                "Wie alt sind Sie?",
+                "Wie viel kostet das?",
+                "Ich bin müde.",
+                "Woher kommen Sie?",
+                "Lassen Sie uns im Park spazieren gehen."
+            ],
+            "es": [
+                "Hola, ¿cómo estás hoy?",
+                "Me llamo Juan García.",
+                "El tiempo está muy bueno hoy.",
+                "Quiero hablar en español.",
+                "Por favor habla despacio y claro.",
+                "Esta es una oración de prueba.",
+                "Me gusta leer libros.",
+                "¿Puedes ayudarme?",
+                "¿Cuál es tu número de teléfono?",
+                "Vendré mañana por la mañana.",
+                "Gracias por tu tiempo.",
+                "Este lugar es muy bonito.",
+                "Estoy aprendiendo a cocinar.",
+                "¿Es este el camino correcto?",
+                "Quiero ir a la escuela.",
+                "¿Cuántos años tienes?",
+                "¿Cuánto cuesta esto?",
+                "Estoy cansado.",
+                "¿De dónde eres?",
+                "Vamos a caminar por el parque."
+            ],
+            "it": [
+                "Ciao, come stai oggi?",
+                "Mi chiamo Mario Rossi.",
+                "Il tempo è molto bello oggi.",
+                "Voglio parlare in italiano.",
+                "Per favore parla lentamente e chiaramente.",
+                "Questa è una frase di prova.",
+                "Mi piace leggere libri.",
+                "Puoi aiutarmi?",
+                "Qual è il tuo numero di telefono?",
+                "Verrò domani mattina.",
+                "Grazie per il tuo tempo.",
+                "Questo posto è molto bello.",
+                "Sto imparando a cucinare.",
+                "È questa la strada giusta?",
+                "Voglio andare a scuola.",
+                "Quanti anni hai?",
+                "Quanto costa questo?",
+                "Sono stanco.",
+                "Da dove vieni?",
+                "Andiamo a fare una passeggiata nel parco."
+            ],
+            "pt": [
+                "Olá, como você está hoje?",
+                "Meu nome é João Silva.",
+                "O tempo está muito bom hoje.",
+                "Quero falar em português.",
+                "Por favor fale devagar e claramente.",
+                "Esta é uma frase de teste.",
+                "Eu gosto de ler livros.",
+                "Você pode me ajudar?",
+                "Qual é o seu número de telefone?",
+                "Vou vir amanhã de manhã.",
+                "Obrigado pelo seu tempo.",
+                "Este lugar é muito bonito.",
+                "Estou aprendendo a cozinhar.",
+                "Este é o caminho certo?",
+                "Quero ir para a escola.",
+                "Quantos anos você tem?",
+                "Quanto custa isso?",
+                "Estou cansado.",
+                "De onde você é?",
+                "Vamos dar um passeio no parque."
+            ],
+            "nl": [
+                "Hallo, hoe gaat het vandaag met je?",
+                "Mijn naam is Jan de Vries.",
+                "Het weer is vandaag erg mooi.",
+                "Ik wil in het Nederlands spreken.",
+                "Spreek langzaam en duidelijk alstublieft.",
+                "Dit is een testzin.",
+                "Ik houd van het lezen van boeken.",
+                "Kun je me helpen?",
+                "Wat is je telefoonnummer?",
+                "Ik kom morgenochtend.",
+                "Bedankt voor je tijd.",
+                "Deze plek is erg mooi.",
+                "Ik leer koken.",
+                "Is dit de juiste weg?",
+                "Ik wil naar school gaan.",
+                "Hoe oud ben je?",
+                "Hoeveel kost dit?",
+                "Ik ben moe.",
+                "Waar kom je vandaan?",
+                "Laten we een wandeling maken in het park."
+            ]
+        }
+
+        fallback_phrases = language_fallbacks.get(language, language_fallbacks["en"])

         if max_phrases:
             fallback_phrases = random.sample(fallback_phrases, min(max_phrases, len(fallback_phrases)))
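A quick way to exercise the loader outside the UI, as a sketch: it assumes interface.py is importable and, for non-fallback phrases, that HF_TOKEN is set in the environment.

# Hypothetical smoke test for the phrase loader.
from interface import load_multilingual_phrases

phrases = load_multilingual_phrases(language="fr", max_phrases=5)
for p in phrases:
    print(p)  # AYA Collection phrases when the dataset loads, otherwise the French fallbacks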
                 ⚠️ No HF_TOKEN detected
             </p>
             <p style="color: rgb(234, 88, 12); margin: 6px 0 0; font-size: 12px;">
+                Set HF_TOKEN environment variable to access CohereLabs AYA Collection dataset with authentic multilingual phrases.
+                This dataset provides high-quality text in 100+ languages for all Voxtral Mini supported languages.
                 Currently using fallback phrases for demonstration.
             </p>
         </div>

     # Hidden state to track dataset JSONL path
     jsonl_path_state = gr.State("")

+    # Language selection for Voxtral Mini supported languages
     language_selector = gr.Dropdown(
         choices=[
             ("English", "en"),
             ("French", "fr"),
+            ("German", "de"),
             ("Spanish", "es"),
             ("Italian", "it"),
             ("Portuguese", "pt"),
             ("Dutch", "nl"),
+            ("Hindi", "hi")
         ],
         value="en",
         label="Language for Speech Phrases",
+        info="Select language for authentic phrases (Voxtral Mini supported languages). All languages use CohereLabs AYA Collection dataset when HF_TOKEN is available."
     )

     # Recording grid with dynamic text readouts
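The dropdown above typically drives a reload of the recording phrases. A sketch of that wiring follows; the handler and the state component are hypothetical and not part of this diff, only language_selector and load_multilingual_phrases come from interface.py.

# Hypothetical event wiring inside the gr.Blocks context.
def on_language_change(lang):
    # Reload phrases for the newly selected language (AYA Collection or fallbacks).
    return load_multilingual_phrases(language=lang, max_phrases=10)

phrases_state = gr.State([])  # assumed holder for the current phrase list
language_selector.change(on_language_change, inputs=language_selector, outputs=phrases_state)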
requirements.txt CHANGED

@@ -1,7 +1,13 @@
 torch
+triton
+torchvision
+torchaudio
 datasets
 peft
 transformers
 gradio
 trackio
-huggingface_hub
+huggingface_hub
+soundfile
+librosa
+mistral-common
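To reproduce the environment locally or in the Space, the standard installation step applies (not part of this commit):

pip install -r requirements.txt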