tee342 committed
Commit c260091 · verified · 1 Parent(s): e0bb421

Update app.py

Files changed (1):
    app.py +78 -91

app.py CHANGED
@@ -21,8 +21,12 @@ import warnings
  from faster_whisper import WhisperModel
  from mutagen.mp3 import MP3
  from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
+ import whisper
+ from pyannote.audio import Pipeline as DiarizationPipeline
+ from openvoice.api import TTS, ToneColorConverter
+ from openvoice.se_extractor import get_se
 
- # Suppress warnings for cleaner logs
+ # Suppress warnings
  warnings.filterwarnings("ignore")
 
  # === Helper Functions ===
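Note on the new imports: pyannote/speaker-diarization is a gated checkpoint, so the DiarizationPipeline.from_pretrained(...) call further down needs a real access token; the hardcoded "YOUR_HF_TOKEN" placeholder will fail at startup. A minimal sketch of reading the token from the environment instead — the HF_TOKEN variable name is an assumption, not something this commit defines:

    import os
    from pyannote.audio import Pipeline as DiarizationPipeline

    # Assumed convention: the gated-model token comes from the environment,
    # not from a literal in app.py.
    diarize_model = DiarizationPipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=os.environ.get("HF_TOKEN"),
    )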
@@ -264,70 +268,57 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
264
  except Exception as e:
265
  return None, f"❌ Batch processing failed: {str(e)}"
266
 
267
- # === Whisper Transcription Tab ===
268
- whisper_model = WhisperModel("base")
269
-
270
- def transcribe_audio(audio_path):
271
- segments, info = whisper_model.transcribe(audio_path, beam_size=5)
272
- text = " ".join([seg.text for seg in segments])
273
- return text
274
-
275
- # === TTS Tab ===
276
- from TTS.api import TTS
277
-
278
- tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
279
 
280
- def generate_tts(text):
281
- out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
282
- tts.tts_to_file(text=text, file_path=out_path)
283
- return out_path
284
-
285
- # === Analyze Audio Stats ===
286
- def analyze_audio(audio_path):
287
- y, sr = torchaudio.load(audio_path)
288
- rms = np.mean(librosa.feature.rms(y=y.numpy().flatten()))
289
- tempo, _ = librosa.beat.beat_track(y=y.numpy().flatten(), sr=sr)
290
- silence_ratio = np.mean(np.abs(y.numpy()) < 0.01)
291
-
292
- plt.figure(figsize=(10, 4))
293
- plt.plot(y.numpy().flatten(), color="lightblue")
294
- plt.title("Loudness Over Time")
295
- plt.tight_layout()
296
- buf = BytesIO()
297
- plt.savefig(buf, format="png")
298
- plt.close()
299
- image = Image.open(buf)
300
-
301
- stats = {
302
- "rms_loudness": float(rms),
303
- "silence_ratio": float(silence_ratio),
304
- "tempo_bpm": int(tempo)
305
- }
306
 
307
- return stats, image
 
 
308
 
309
- # === Mix Two Tracks ===
310
- def mix_tracks(track1, track2, volume_offset=0):
311
- a1 = AudioSegment.from_file(track1)
312
- a2 = AudioSegment.from_file(track2)
313
- mixed = a1.overlay(a2 - volume_offset)
314
- out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
315
- mixed.export(out_path, format="wav")
316
- return out_path
317
 
318
- # === Save/Load Project File (.aiproj) ===
319
- def save_project(audio_path, preset_name, effects):
320
- project_data = {
321
- "audio": AudioSegment.from_file(audio_path).raw_data,
322
- "preset": preset_name,
323
- "effects": effects
324
- }
325
- out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
326
- with open(out_path, "wb") as f:
327
- pickle.dump(project_data, f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  return out_path
329
 
330
- # UI Setup
331
  effect_options = [
332
  "Noise Reduction",
333
  "Compress Dynamic Range",
@@ -407,17 +398,41 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
              clear_btn=None
          )
 
-     # --- Transcribe & Edit ---
+     # --- Transcribe & Edit ===
      with gr.Tab("📝 Transcribe & Edit"):
          gr.Interface(
              fn=transcribe_audio,
              inputs=gr.Audio(label="Upload Audio", type="filepath"),
              outputs=gr.Textbox(label="Transcribed Text", lines=10),
              title="Transcribe & Edit Spoken Content",
-             description="Convert voice to text, then edit the script before exporting again."
+             description="Convert voice to text and edit it before exporting again."
          )
 
-     # --- TTS Voice Generator ---
+     # --- Speaker Diarization ===
+     with gr.Tab("🧍‍♂️ Who Spoke When?"):
+         gr.Interface(
+             fn=diarize_and_transcribe,
+             inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+             outputs=gr.JSON(label="Diarized Transcript"),
+             title="Split By Speaker + Transcribe",
+             description="Detect speakers and transcribe their speech automatically."
+         )
+
+     # --- Voice Cloning (Dubbing) ===
+     with gr.Tab("🎭 Voice Cloning (Dubbing)"):
+         gr.Interface(
+             fn=clone_voice,
+             inputs=[
+                 gr.File(label="Source Voice Clip"),
+                 gr.File(label="Target Voice Clip"),
+                 gr.Textbox(label="Text to Clone", lines=5)
+             ],
+             outputs=gr.Audio(label="Cloned Output", type="filepath"),
+             title="Replace One Voice With Another",
+             description="Clone voice from source to target speaker using AI"
+         )
+
+     # --- TTS Voice Generator ===
      with gr.Tab("💬 TTS Voice Generator"):
          gr.Interface(
              fn=generate_tts,
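The Transcribe & Edit and TTS tabs still pass fn=transcribe_audio and fn=generate_tts, but this commit deletes both definitions, so building the Blocks app raises a NameError. The same applies to analyze_audio, whose Audio Analysis tab is also kept. A minimal sketch of re-adding the two small helpers against the models this commit loads; it assumes the OpenVoice TTS wrapper really exposes the tts_to_file method that clone_voice already calls:

    def transcribe_audio(audio_path):
        # whisper_model is the openai-whisper model loaded at startup
        return whisper_model.transcribe(audio_path)["text"]

    def generate_tts(text):
        out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
        tts_model.tts_to_file(text=text, file_path=out_path)  # same call clone_voice uses
        return out_path

analyze_audio would need its old body, visible in the deletion hunk above, restored unchanged.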
@@ -427,7 +442,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
              description="Type anything and turn it into natural-sounding speech."
          )
 
-     # --- Audio Analysis Dashboard ---
+     # --- Audio Analysis Dashboard ===
      with gr.Tab("📊 Audio Analysis"):
          gr.Interface(
              fn=analyze_audio,
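Each tab wraps a bare gr.Interface(...) constructor inside a gr.Blocks context. Whether that auto-renders depends on the Gradio version; the documented way to embed a prebuilt Interface in Blocks is an explicit render() call. A short sketch of the more defensive pattern — the two output components are an assumption read off analyze_audio's (stats, image) return value, not labels taken from this commit:

    with gr.Tab("📊 Audio Analysis"):
        gr.Interface(
            fn=analyze_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            # assumed components: analyze_audio returns (stats dict, PIL image)
            outputs=[gr.JSON(label="Stats"), gr.Image(label="Loudness Plot")],
        ).render()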
@@ -440,32 +455,4 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
              description="Analyze audio loudness, tempo, and frequency content."
          )
 
-     # --- Mix Two Tracks ---
-     with gr.Tab("🔀 Mix Two Tracks"):
-         gr.Interface(
-             fn=mix_tracks,
-             inputs=[
-                 gr.File(label="Main Track"),
-                 gr.File(label="Background Track"),
-                 gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
-             ],
-             outputs=gr.File(label="Mixed Output"),
-             title="Overlay Two Tracks",
-             description="Mix or subtract two audio files."
-         )
-
-     # --- Load/Save Project ---
-     with gr.Tab("📁 Save/Load Project"):
-         gr.Interface(
-             fn=save_project,
-             inputs=[
-                 gr.File(label="Original Audio"),
-                 gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
-                 gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
-             ],
-             outputs=gr.File(label="Project File (.aiproj)"),
-             title="Save Everything Together",
-             description="Save your session, effects, and settings in one file to reuse later."
-         )
-
      demo.launch()
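demo.launch() serves every request directly, and diarization plus tone conversion are long-running GPU calls. Gradio's built-in queue is the usual guard; a one-line sketch (queue() is a standard Blocks method, though its keyword arguments have shifted across Gradio versions):

    # Queue long-running jobs (diarization, voice cloning) instead of
    # running them on bare request threads.
    demo.queue().launch()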