Joseph Pollack committed on
Commit
b3ee71e
·
unverified ·
1 Parent(s): 0f6c755

Adds dynamic, multilingual VoxPopuli phrases in place of the hard-coded phrase list; sets ssr_mode=False in the Gradio interfaces

Browse files
interface.py CHANGED
@@ -251,18 +251,54 @@ def start_voxtral_training(
251
  yield line
252
 
253
 
254
- PHRASES = [
255
- "The quick brown fox jumps over the lazy dog.",
256
- "Please say your full name.",
257
- "Today is a good day to learn something new.",
258
- "Artificial intelligence helps with many tasks.",
259
- "I enjoy reading books and listening to music.",
260
- "This is a sample sentence for testing speech.",
261
- "Speak clearly and at a normal pace.",
262
- "Numbers like one, two, three are easy to say.",
263
- "The weather is sunny with a chance of rain.",
264
- "Thank you for taking the time to help.",
265
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
  with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
268
  has_gpu, gpu_msg = detect_nvidia_driver()
@@ -301,16 +337,87 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
301
 
302
  jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
303
 
 
 
 
 
 
 
 
 
304
  # Recording grid with dynamic text readouts
305
- phrase_texts_state = gr.State(PHRASES)
 
 
306
  phrase_markdowns: list[gr.Markdown] = []
307
  rec_components = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  with gr.Column():
309
- for idx, phrase in enumerate(PHRASES):
310
- md = gr.Markdown(f"**{idx+1}. {phrase}**")
311
- phrase_markdowns.append(md)
312
- comp = gr.Audio(sources="microphone", type="numpy", label=f"Recording {idx+1}")
313
- rec_components.append(comp)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  # Advanced options accordion
316
  with gr.Accordion("Advanced options", open=False):
@@ -366,7 +473,8 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
366
  sr, data = rec
367
  out_path = wav_dir / f"rec_{i:04d}.wav"
368
  sf.write(str(out_path), data, sr)
369
- label_text = (texts[i] if isinstance(texts, list) and i < len(texts) else (PHRASES[i] if i < len(PHRASES) else ""))
 
370
  rows.append({"audio_path": str(out_path), "text": label_text})
371
  jsonl_path = dataset_dir / "data.jsonl"
372
  _write_jsonl(rows, jsonl_path)
@@ -409,11 +517,15 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
409
  jsonl_path = dataset_dir / "data.jsonl"
410
  _write_jsonl(rows, jsonl_path)
411
  # Build markdown content updates for on-screen prompts
412
- md_updates = []
413
  for i in range(len(phrase_markdowns)):
414
  t = texts[i] if i < len(texts) else ""
415
- md_updates.append(f"**{i+1}. {t}**")
416
- return (str(jsonl_path), texts, *md_updates)
 
 
 
 
417
 
418
  vp_btn.click(
419
  _collect_voxpopuli,
@@ -439,6 +551,6 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
439
  if __name__ == "__main__":
440
  server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
441
  server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
442
- demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True)
443
 
444
 
 
251
  yield line
252
 
253
 
254
def load_voxpopuli_phrases(language: str = "en", max_phrases: "int | None" = None, split: str = "train") -> list:
    """Load prompt phrases from the VoxPopuli dataset.

    Only the ``normalized_text`` column is read, so full rows (which also
    carry the audio clip) are never materialized just to get the transcript.

    Args:
        language: Language code (e.g., 'en', 'de', 'fr', etc.)
        max_phrases: Maximum number of phrases to return. ``None`` returns
            every usable phrase; ``0`` returns an empty list.
        split: Dataset split to use ('train', 'validation', 'test')

    Returns:
        List of stripped transcript strings in random order. If the dataset
        cannot be loaded, or contains no usable phrase, a small built-in list
        of English phrases is returned instead (unsampled and unshuffled).
    """
    import random

    # Returned whenever the dataset is unavailable or yields nothing usable.
    fallback_phrases = [
        "The quick brown fox jumps over the lazy dog.",
        "Please say your full name.",
        "Today is a good day to learn something new.",
        "Artificial intelligence helps with many tasks.",
        "I enjoy reading books and listening to music.",
    ]
    text_column = "normalized_text"
    min_chars = 10  # phrases of this length or shorter are filtered out

    try:
        from datasets import load_dataset

        # Load the specified language dataset
        ds = load_dataset("facebook/voxpopuli", language, split=split)

        # Read just the transcript column instead of iterating whole rows.
        texts = ds[text_column] if text_column in ds.column_names else []

        # Skip missing (None) transcripts instead of failing the whole load.
        phrases = [
            text.strip()
            for text in texts
            if isinstance(text, str) and len(text.strip()) > min_chars
        ]

        if not phrases:
            # An empty result would leave the UI with no rows at all.
            print(f"Error loading VoxPopuli phrases: no usable phrases for '{language}' ({split})")
            return fallback_phrases

        if max_phrases is not None:
            # Explicit None check so that max_phrases=0 is honoured.
            phrases = random.sample(phrases, min(max_phrases, len(phrases)))
        else:
            # If no limit, shuffle the entire list
            random.shuffle(phrases)

        return phrases

    except Exception as e:
        # Deliberate best-effort boundary: a missing `datasets` package, a
        # network failure or a bad language code must not break the interface.
        print(f"Error loading VoxPopuli phrases: {e}")
        return fallback_phrases
298
+
299
+ # Initialize phrases dynamically
300
+ VOXPOPULI_LANGUAGE = "en" # Default to English
301
+ ALL_PHRASES = load_voxpopuli_phrases(VOXPOPULI_LANGUAGE, max_phrases=None)
302
 
303
  with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
304
  has_gpu, gpu_msg = detect_nvidia_driver()
 
337
 
338
  jsonl_out = gr.Textbox(label="Dataset JSONL path", interactive=False, visible=True)
339
 
340
+ # Language selection for VoxPopuli phrases
341
+ voxpopuli_lang = gr.Dropdown(
342
+ choices=["en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr", "sk", "sl", "et", "lt"],
343
+ value="en",
344
+ label="VoxPopuli Language",
345
+ info="Select language for phrases from VoxPopuli dataset"
346
+ )
347
+
348
  # Recording grid with dynamic text readouts
349
+ phrase_texts_state = gr.State(ALL_PHRASES)
350
+ visible_rows_state = gr.State(10) # Start with 10 visible rows
351
+ max_rows = len(ALL_PHRASES) # No cap on total rows
352
  phrase_markdowns: list[gr.Markdown] = []
353
  rec_components = []
354
+
355
def create_recording_grid(phrases, visible_count=10):
    """Build one prompt label and one microphone recorder per phrase.

    Components are created in interleaved order (label, recorder, label,
    recorder, ...). Rows whose zero-based index is below ``visible_count``
    start visible; the rest start hidden.

    Returns:
        A ``(markdowns, recordings)`` pair of equally long lists.
    """
    prompt_labels, audio_inputs = [], []
    for number, prompt in enumerate(phrases, start=1):
        shown = (number - 1) < visible_count
        prompt_labels.append(
            gr.Markdown(f"**{number}. {prompt}**", visible=shown)
        )
        audio_inputs.append(
            gr.Audio(sources="microphone", type="numpy", label=f"Recording {number}", visible=shown)
        )
    return prompt_labels, audio_inputs
366
+
367
+ # Initial grid creation
368
  with gr.Column():
369
+ phrase_markdowns, rec_components = create_recording_grid(ALL_PHRASES, 10)
370
+
371
+ # Add more rows button
372
+ add_rows_btn = gr.Button(" Add 10 More Rows", variant="secondary")
373
+
374
def add_more_rows(current_visible, current_phrases):
    """Reveal up to 10 more recording rows.

    The click handler lists ``[visible_rows_state] + phrase_markdowns +
    rec_components`` as outputs, so this returns the new visible count
    followed by one update per markdown and then one update per recorder.

    Args:
        current_visible: Number of rows currently shown.
        current_phrases: Phrase list currently held in state.

    Returns:
        ``[new_visible, *markdown_updates, *recorder_updates]``.
    """
    # Never reveal more rows than there are phrases or existing components.
    row_count = min(len(phrase_markdowns), len(rec_components))
    new_visible = min(current_visible + 10, len(current_phrases), row_count)

    # Updates are sized by the component lists, not by the phrase list, so
    # the number of returned values always matches the declared outputs.
    markdown_updates = [
        gr.update(visible=i < new_visible) for i in range(len(phrase_markdowns))
    ]
    recorder_updates = [
        gr.update(visible=i < new_visible) for i in range(len(rec_components))
    ]
    return [new_visible] + markdown_updates + recorder_updates
384
+
385
def change_language(language):
    """Reload phrases for ``language`` and refresh the recording grid.

    The change handler lists ``[phrase_texts_state, visible_rows_state] +
    phrase_markdowns + rec_components`` as outputs, so this returns the new
    phrase list, the new visible-row count, one update per markdown and then
    one update per recorder.

    The grid is built once, so only the existing rows can be updated; phrases
    beyond the number of rows are kept in state but have no row to show them.

    Args:
        language: VoxPopuli language code selected in the dropdown.

    Returns:
        ``[new_phrases, visible_count, *markdown_updates, *recorder_updates]``.
    """
    new_phrases = load_voxpopuli_phrases(language, max_phrases=None)
    row_count = len(phrase_markdowns)

    # Reset to 10 visible rows, bounded by the available phrases and rows.
    visible_count = min(10, len(new_phrases), row_count)

    markdown_updates = []
    for i in range(row_count):
        if i < len(new_phrases):
            # Refresh the text of hidden rows too; otherwise they would show
            # the previous language's phrase once "add more rows" reveals them.
            markdown_updates.append(
                gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=i < visible_count)
            )
        else:
            markdown_updates.append(gr.update(visible=False))

    # NOTE(review): existing recordings are left untouched here; a clip
    # recorded for the previous phrase would be labelled with the new phrase
    # when the dataset is saved - confirm whether recorders should be cleared.
    recorder_updates = [
        gr.update(visible=i < visible_count) for i in range(len(rec_components))
    ]

    return [new_phrases, visible_count] + markdown_updates + recorder_updates
408
+
409
+ # Connect language change to phrase reloading
410
+ voxpopuli_lang.change(
411
+ change_language,
412
+ inputs=[voxpopuli_lang],
413
+ outputs=[phrase_texts_state, visible_rows_state] + phrase_markdowns + rec_components
414
+ )
415
+
416
+ add_rows_btn.click(
417
+ add_more_rows,
418
+ inputs=[visible_rows_state, phrase_texts_state],
419
+ outputs=[visible_rows_state] + phrase_markdowns + rec_components
420
+ )
421
 
422
  # Advanced options accordion
423
  with gr.Accordion("Advanced options", open=False):
 
473
  sr, data = rec
474
  out_path = wav_dir / f"rec_{i:04d}.wav"
475
  sf.write(str(out_path), data, sr)
476
+ # Use the full phrase list (ALL_PHRASES) instead of just PHRASES
477
+ label_text = (texts[i] if isinstance(texts, list) and i < len(texts) else (ALL_PHRASES[i] if i < len(ALL_PHRASES) else ""))
478
  rows.append({"audio_path": str(out_path), "text": label_text})
479
  jsonl_path = dataset_dir / "data.jsonl"
480
  _write_jsonl(rows, jsonl_path)
 
517
  jsonl_path = dataset_dir / "data.jsonl"
518
  _write_jsonl(rows, jsonl_path)
519
  # Build markdown content updates for on-screen prompts
520
+ combined_updates = []
521
  for i in range(len(phrase_markdowns)):
522
  t = texts[i] if i < len(texts) else ""
523
+ if i < len(texts):
524
+ combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
525
+ else:
526
+ combined_updates.append(gr.update(visible=False))
527
+
528
+ return (str(jsonl_path), texts, *combined_updates)
529
 
530
  vp_btn.click(
531
  _collect_voxpopuli,
 
551
  if __name__ == "__main__":
552
  server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
553
  server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
554
+ demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True, ssr_mode=False)
555
 
556
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  torch
2
  datasets
3
  peft
4
- transformers
 
 
1
  torch
2
  datasets
3
  peft
4
+ transformers
5
+ gradio
templates/spaces/demo_voxtral/app.py CHANGED
@@ -30,6 +30,6 @@ with gr.Blocks() as demo:
30
  btn.click(transcribe, inputs=[audio], outputs=[out])
31
 
32
  if __name__ == "__main__":
33
- demo.launch(mcp_server=True)
34
 
35
 
 
30
  btn.click(transcribe, inputs=[audio], outputs=[out])
31
 
32
  if __name__ == "__main__":
33
+ demo.launch(mcp_server=True, ssr_mode=False)
34
 
35