tee342 committed on
Commit
dc26431
·
verified ·
1 Parent(s): 2f52f6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -91
app.py CHANGED
@@ -266,7 +266,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
266
  except Exception as e:
267
  return None, f"❌ Batch processing failed: {str(e)}"
268
 
269
- # === Transcribe & Edit Tab ===
270
  whisper_model = WhisperModel("base")
271
 
272
  def transcribe_audio(audio_path):
@@ -274,7 +274,7 @@ def transcribe_audio(audio_path):
274
  text = " ".join([seg.text for seg in segments])
275
  return text
276
 
277
- # === TTS Tab ===
278
  tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
279
 
280
  def generate_tts(text):
@@ -282,6 +282,23 @@ def generate_tts(text):
282
  tts.tts_to_file(text=text, file_path=out_path)
283
  return out_path
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  # === Trim Silence Automatically (VAD) ===
286
  def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
287
  audio = AudioSegment.from_file(audio_file)
@@ -309,42 +326,53 @@ def mix_tracks(track1, track2, volume_offset=0):
309
  mixed.export(out_path, format="wav")
310
  return out_path
311
 
312
- # === Save/Load Project File (.aiproj) ===
313
- def save_project(audio_path, preset_name, effects):
314
- project_data = {
315
- "audio": AudioSegment.from_file(audio_path).raw_data,
316
- "preset": preset_name,
317
- "effects": effects
318
- }
319
- out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
320
- with open(out_path, "wb") as f:
321
- pickle.dump(project_data, f)
322
- return out_path
323
 
324
- def load_project(project_file):
325
- with open(project_file.name, "rb") as f:
326
- data = pickle.load(f)
327
- return data["preset"], data["effects"]
 
328
 
329
- # === Auto-Save / Resume Sessions ===
330
- def save_or_resume_session(audio, preset, effects, action="save"):
331
- if action == "save":
332
- return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
333
- elif action == "load" and isinstance(audio, dict):
334
- return (
335
- None,
336
- audio.get("audio"),
337
- audio.get("preset"),
338
- audio.get("effects")
339
- )
340
- return None, None, None, None
341
 
342
- # === Voice Cloning – Fallback Version for Hugging Face ===
343
- def clone_voice(source_audio, target_audio, text):
344
- print("⚠️ Voice cloning not available in browser version — use local install for full support")
345
- return generate_tts(text)
346
 
347
- # === UI Setup ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  effect_options = [
349
  "Noise Reduction",
350
  "Compress Dynamic Range",
@@ -424,17 +452,17 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
424
  clear_btn=None
425
  )
426
 
427
- # --- Transcribe & Edit ---
428
  with gr.Tab("📝 Transcribe & Edit"):
429
  gr.Interface(
430
  fn=transcribe_audio,
431
  inputs=gr.Audio(label="Upload Audio", type="filepath"),
432
  outputs=gr.Textbox(label="Transcribed Text", lines=10),
433
- title="Transcribe Spoken Content",
434
  description="Convert voice to text and edit it before exporting again."
435
  )
436
 
437
- # --- TTS Voice Generator ---
438
  with gr.Tab("💬 TTS Voice Generator"):
439
  gr.Interface(
440
  fn=generate_tts,
@@ -444,7 +472,52 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
444
  description="Type anything and turn it into natural-sounding speech."
445
  )
446
 
447
- # --- VAD – Detect & Remove Silence ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  with gr.Tab("✂️ Trim Silence Automatically"):
449
  gr.Interface(
450
  fn=detect_silence,
@@ -483,28 +556,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
483
  description="Load your saved session"
484
  )
485
 
486
- # --- Auto-Save / Resume Sessions ===
487
- session_state = gr.State()
488
-
489
- with gr.Tab("🧾 Auto-Save & Resume"):
490
- gr.Markdown("Save your current state and resume editing later.")
491
-
492
- action_radio = gr.Radio(["save", "load"], label="Action", value="save")
493
- audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
494
- preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
495
- effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
496
- save_btn = gr.Button("Save or Load Session")
497
-
498
- loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
499
- loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
500
- loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
501
-
502
- save_btn.click(
503
- fn=save_or_resume_session,
504
- inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio],
505
- outputs=[session_state, loaded_audio, loaded_preset, loaded_effects]
506
- )
507
-
508
  # --- Mix Two Tracks ===
509
  with gr.Tab("🔀 Mix Two Tracks"):
510
  gr.Interface(
@@ -516,37 +567,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
516
  ],
517
  outputs=gr.File(label="Mixed Output"),
518
  title="Overlay Two Tracks",
519
- description="Mix, blend, or subtract two audio files."
520
- )
521
-
522
- # === Voice Style Transfer (Dummy) ===
523
- def apply_style_transfer(audio_path, mood="Happy"):
524
- return audio_path
525
-
526
- with gr.Tab("🧠 Voice Style Transfer"):
527
- gr.Interface(
528
- fn=apply_style_transfer,
529
- inputs=[
530
- gr.Audio(label="Upload Voice Clip", type="filepath"),
531
- gr.Radio(["Happy", "Sad", "Angry", "Calm"], label="Choose Tone")
532
- ],
533
- outputs=gr.Audio(label="Stylized Output", type="filepath"),
534
- title="Change Emotional Tone of Voice",
535
- description="Shift the emotional style of any voice clip."
536
- )
537
-
538
- # --- Voice Cloning (Fallback) ===
539
- with gr.Tab("🎭 Voice Cloning (Demo)"):
540
- gr.Interface(
541
- fn=clone_voice,
542
- inputs=[
543
- gr.File(label="Source Voice Clip"),
544
- gr.File(label="Target Voice Clip"),
545
- gr.Textbox(label="Text to Clone", lines=5)
546
- ],
547
- outputs=gr.Audio(label="Cloned Output", type="filepath"),
548
- title="Replace One Voice With Another (Demo)",
549
- description="Clone voice from source to target speaker using AI"
550
  )
551
 
552
  demo.launch()
 
266
  except Exception as e:
267
  return None, f"❌ Batch processing failed: {str(e)}"
268
 
269
+ # === Whisper Transcription Tab ===
270
  whisper_model = WhisperModel("base")
271
 
272
  def transcribe_audio(audio_path):
 
274
  text = " ".join([seg.text for seg in segments])
275
  return text
276
 
277
+ # === TTS Voice Generator ===
278
  tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
279
 
280
  def generate_tts(text):
 
282
  tts.tts_to_file(text=text, file_path=out_path)
283
  return out_path
284
 
285
+ # === Save/Load Project File (.aiproj) ===
286
+ def save_project(audio_path, preset_name, effects):
287
+ project_data = {
288
+ "audio": AudioSegment.from_file(audio_path).raw_data,
289
+ "preset": preset_name,
290
+ "effects": effects
291
+ }
292
+ out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
293
+ with open(out_path, "wb") as f:
294
+ pickle.dump(project_data, f)
295
+ return out_path
296
+
297
+ def load_project(project_file):
298
+ with open(project_file.name, "rb") as f:
299
+ data = pickle.load(f)
300
+ return data["preset"], data["effects"]
301
+
302
  # === Trim Silence Automatically (VAD) ===
303
  def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
304
  audio = AudioSegment.from_file(audio_file)
 
326
  mixed.export(out_path, format="wav")
327
  return out_path
328
 
329
+ # === Speaker Diarization ("Who Spoke When?") ===
330
+ try:
331
+ from pyannote.audio import Pipeline as DiarizationPipeline
332
+ from huggingface_hub import login
 
 
 
 
 
 
 
333
 
334
+ hf_token = os.getenv("HF_TOKEN")
335
+ if hf_token:
336
+ login(token=hf_token)
337
+ else:
338
+ print("⚠️ HF_TOKEN not set – speaker diarization disabled")
339
 
340
+ diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
341
+ except ImportError:
342
+ diarize_pipeline = None
343
+ print("⚠️ PyAnnote not installed – speaker diarization disabled")
 
 
 
 
 
 
 
 
344
 
345
+ def diarize_and_transcribe(audio_path):
346
+ if diarize_pipeline is None:
347
+ return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
 
348
 
349
+ # Run diarization
350
+ audio = AudioSegment.from_file(audio_path)
351
+ temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
352
+ audio.export(temp_wav, format="wav")
353
+
354
+ try:
355
+ from pyannote.audio import Pipeline as DiarizationPipeline
356
+ diarization = diarize_pipeline(temp_wav)
357
+
358
+ # Run transcription
359
+ result = whisper.transcribe(temp_wav)
360
+
361
+ segments = []
362
+ for turn, _, speaker in diarization.itertracks(yield_label=True):
363
+ text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
364
+ segments.append({
365
+ "speaker": speaker,
366
+ "start": turn.start,
367
+ "end": turn.end,
368
+ "text": text
369
+ })
370
+
371
+ return segments
372
+ except Exception as e:
373
+ return f"⚠️ Diarization failed: {str(e)}"
374
+
375
+ # === UI ===
376
  effect_options = [
377
  "Noise Reduction",
378
  "Compress Dynamic Range",
 
452
  clear_btn=None
453
  )
454
 
455
+ # --- Transcribe & Edit Tab ===
456
  with gr.Tab("📝 Transcribe & Edit"):
457
  gr.Interface(
458
  fn=transcribe_audio,
459
  inputs=gr.Audio(label="Upload Audio", type="filepath"),
460
  outputs=gr.Textbox(label="Transcribed Text", lines=10),
461
+ title="Transcribe & Edit Spoken Content",
462
  description="Convert voice to text and edit it before exporting again."
463
  )
464
 
465
+ # --- TTS Voice Generator ===
466
  with gr.Tab("💬 TTS Voice Generator"):
467
  gr.Interface(
468
  fn=generate_tts,
 
472
  description="Type anything and turn it into natural-sounding speech."
473
  )
474
 
475
+ # --- Speaker Diarization (Who Spoke When?) ===
476
+ with gr.Tab("🧍‍♂️ Who Spoke When?"):
477
+ gr.Interface(
478
+ fn=diarize_and_transcribe,
479
+ inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
480
+ outputs=gr.JSON(label="Diarized Transcript"),
481
+ title="Split By Speaker + Transcribe",
482
+ description="Detect speakers and transcribe their speech automatically."
483
+ )
484
+
485
+ # --- Auto-Save / Resume Sessions ===
486
+ session_state = gr.State()
487
+
488
+ def save_or_resume_session(audio, preset, effects, action="save"):
489
+ if action == "save":
490
+ return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
491
+ elif action == "load" and isinstance(audio, dict):
492
+ return (
493
+ None,
494
+ audio.get("audio"),
495
+ audio.get("preset"),
496
+ audio.get("effects")
497
+ )
498
+ return None, None, None, None
499
+
500
+ with gr.Tab("🧾 Auto-Save & Resume"):
501
+ gr.Markdown("Save your current state and resume later.")
502
+
503
+ action_radio = gr.Radio(["save", "load"], label="Action", value="save")
504
+ audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
505
+ preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
506
+ effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
507
+ action_btn = gr.Button("Save or Load Session")
508
+
509
+ session_data = gr.State()
510
+ loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
511
+ loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
512
+ loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
513
+
514
+ action_btn.click(
515
+ fn=save_or_resume_session,
516
+ inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio],
517
+ outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
518
+ )
519
+
520
+ # --- Trim Silence Automatically (VAD) ===
521
  with gr.Tab("✂️ Trim Silence Automatically"):
522
  gr.Interface(
523
  fn=detect_silence,
 
556
  description="Load your saved session"
557
  )
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  # --- Mix Two Tracks ===
560
  with gr.Tab("🔀 Mix Two Tracks"):
561
  gr.Interface(
 
567
  ],
568
  outputs=gr.File(label="Mixed Output"),
569
  title="Overlay Two Tracks",
570
+ description="Mix or subtract two audio files."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  )
572
 
573
  demo.launch()