Update app.py
app.py
CHANGED
@@ -21,8 +21,12 @@ import warnings
 from faster_whisper import WhisperModel
 from mutagen.mp3 import MP3
 from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
+import whisper
+from pyannote.audio import Pipeline as DiarizationPipeline
+from openvoice.api import TTS, ToneColorConverter
+from openvoice.se_extractor import get_se
 
 # Suppress warnings
 warnings.filterwarnings("ignore")
 
 # === Helper Functions ===
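Note that `pyannote/speaker-diarization` is a gated model, so the hard-coded `use_auth_token="YOUR_HF_TOKEN"` placeholder in the next hunk will fail at startup as committed. A minimal sketch of the usual Spaces pattern instead, assuming a secret named `HF_TOKEN` has been configured in the Space settings:

```python
import os

from pyannote.audio import Pipeline as DiarizationPipeline

# Read the gated-model token from the Space's secrets rather than
# committing it to app.py ("HF_TOKEN" is an assumed secret name).
diarize_model = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ["HF_TOKEN"],
)
```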
@@ -264,70 +268,57 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
-whisper_model = WhisperModel("base")
-
-def transcribe_audio(audio_path):
-    segments, info = whisper_model.transcribe(audio_path, beam_size=5)
-    text = " ".join([seg.text for seg in segments])
-    return text
-
-# === TTS Tab ===
-from TTS.api import TTS
-
-tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+# === Load Models Once at Start ===
 
-def generate_tts(text):
-
-    tts.tts_to_file(text=text, file_path=out_path)
-    return out_path
-
-# === Analyze Audio Stats ===
-def analyze_audio(audio_path):
-    y, sr = torchaudio.load(audio_path)
-    rms = np.mean(librosa.feature.rms(y=y.numpy().flatten()))
-    tempo, _ = librosa.beat.beat_track(y=y.numpy().flatten(), sr=sr)
-    silence_ratio = np.mean(np.abs(y.numpy()) < 0.01)
-
-    plt.figure(figsize=(10, 4))
-    plt.plot(y.numpy().flatten(), color="lightblue")
-    plt.title("Loudness Over Time")
-    plt.tight_layout()
-    buf = BytesIO()
-    plt.savefig(buf, format="png")
-    plt.close()
-    image = Image.open(buf)
-
-    stats = {
-        "rms_loudness": float(rms),
-        "silence_ratio": float(silence_ratio),
-        "tempo_bpm": int(tempo)
-    }
+# 🧠 Speaker Diarization Model
+diarize_model = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="YOUR_HF_TOKEN")
 
-
+# 🎤 OpenVoice TTS + Converter
+tts_model = TTS(lang='en')
+tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
 
-# ===
-def mix_tracks(track1, track2, volume_offset):
-    a1 = AudioSegment.from_file(track1)
-    a2 = AudioSegment.from_file(track2)
-    mixed = a1.overlay(a2 - volume_offset)
-    out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
-    mixed.export(out_path, format="wav")
-    return out_path
+# === Transcribe & Diarize Tab ===
+whisper_model = WhisperModel("base")
 
-
-
-
-
-
-
-
-
-
-
+def diarize_and_transcribe(audio_path):
+    # Run diarization (pyannote expects a plain WAV file)
+    audio = AudioSegment.from_file(audio_path)
+    temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
+    audio.export(temp_wav, format="wav")
+    diarization = diarize_model(temp_wav)
+
+    # Run transcription with the shared faster-whisper model; materialize
+    # the segment generator so it can be scanned once per speaker turn
+    asr_segments, _ = whisper_model.transcribe(temp_wav, beam_size=5)
+    asr_segments = list(asr_segments)
+
+    segments = []
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        text = " ".join(seg.text for seg in asr_segments if seg.start >= turn.start and seg.end <= turn.end)
+        segments.append({
+            "speaker": speaker,
+            "start": turn.start,
+            "end": turn.end,
+            "text": text
+        })
+
+    return segments
+
+# === Voice Cloning (Dubbing) ===
+def clone_voice(source_audio, target_audio, text):
+    source_se, _ = get_se(source_audio, tone_converter)
+    target_se, _ = get_se(target_audio, tone_converter)
+    out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
+    tts_model.tts_to_file(text=text, file_path=out_path)
+    tone_converter.convert(
+        audio_src_path=out_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=out_path
+    )
     return out_path
 
-# UI
+# === UI ===
 effect_options = [
     "Noise Reduction",
     "Compress Dynamic Range",
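One caveat in `diarize_and_transcribe` above: the containment test `seg.start >= turn.start and seg.end <= turn.end` silently drops any transcription segment that straddles a speaker-turn boundary, which is common at interruptions. (Note also that this hunk deletes `transcribe_audio`, `generate_tts`, and `analyze_audio` while the UI tabs below still reference them; unless duplicate definitions survive elsewhere in app.py, those definitions need to stay.) A more forgiving sketch, using the same `asr_segments` and pyannote `turn` objects as above, assigns a segment to every turn it overlaps:

```python
def text_for_turn(asr_segments, turn):
    # Keep any ASR segment that overlaps the diarization turn at all,
    # rather than requiring it to fall entirely inside the turn.
    parts = []
    for seg in asr_segments:
        overlap = min(seg.end, turn.end) - max(seg.start, turn.start)
        if overlap > 0:
            parts.append(seg.text.strip())
    return " ".join(parts)
```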
@@ -407,17 +398,41 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             clear_btn=None
         )
 
-    # --- Transcribe & Edit
+    # --- Transcribe & Edit ---
     with gr.Tab("📝 Transcribe & Edit"):
         gr.Interface(
            fn=transcribe_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            outputs=gr.Textbox(label="Transcribed Text", lines=10),
            title="Transcribe & Edit Spoken Content",
-           description="Convert voice to text"
+           description="Convert voice to text and edit it before exporting again."
        )
 
-    # ---
+    # --- Speaker Diarization ---
+    with gr.Tab("🧏‍♂️ Who Spoke When?"):
+        gr.Interface(
+            fn=diarize_and_transcribe,
+            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+            outputs=gr.JSON(label="Diarized Transcript"),
+            title="Split By Speaker + Transcribe",
+            description="Detect speakers and transcribe their speech automatically."
+        )
+
+    # --- Voice Cloning (Dubbing) ---
+    with gr.Tab("🎭 Voice Cloning (Dubbing)"):
+        gr.Interface(
+            fn=clone_voice,
+            inputs=[
+                gr.File(label="Source Voice Clip"),
+                gr.File(label="Target Voice Clip"),
+                gr.Textbox(label="Text to Clone", lines=5)
+            ],
+            outputs=gr.Audio(label="Cloned Output", type="filepath"),
+            title="Replace One Voice With Another",
+            description="Clone voice from source to target speaker using AI."
+        )
+
+    # --- TTS Voice Generator ---
     with gr.Tab("💬 TTS Voice Generator"):
         gr.Interface(
             fn=generate_tts,
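The `TTS(lang='en')` and argument-less `ToneColorConverter()` constructors in the second hunk don't match the OpenVoice API I know, where both models are built from a config plus checkpoint and `se_extractor.get_se` takes the converter as its second argument. A hedged sketch along the lines of the upstream OpenVoice demo (all checkpoint paths are assumptions):

```python
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint paths assumed from the upstream demo layout.
tts_model = BaseSpeakerTTS("checkpoints/base_speakers/EN/config.json", device=device)
tts_model.load_ckpt("checkpoints/base_speakers/EN/checkpoint.pth")
tone_converter = ToneColorConverter("checkpoints/converter/config.json", device=device)
tone_converter.load_ckpt("checkpoints/converter/checkpoint.pth")

def clone_voice(source_audio, target_audio, text):
    # Tone-color embeddings for both clips; get_se needs the converter.
    source_se, _ = se_extractor.get_se(source_audio, tone_converter)
    target_se, _ = se_extractor.get_se(target_audio, tone_converter)
    out_path = "cloned_output.wav"
    # Speak the text with the base voice, then re-color it in place.
    tts_model.tts(text, out_path, speaker="default", language="English")
    tone_converter.convert(audio_src_path=out_path, src_se=source_se,
                           tgt_se=target_se, output_path=out_path)
    return out_path
```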
@@ -427,7 +442,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Type anything and turn it into natural-sounding speech."
         )
 
-    # --- Audio Analysis Dashboard
+    # --- Audio Analysis Dashboard ---
     with gr.Tab("📊 Audio Analysis"):
         gr.Interface(
             fn=analyze_audio,
@@ -440,32 +455,4 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Analyze audio loudness, tempo, and frequency content."
         )
 
-    # --- Mix Two Tracks ---
-    with gr.Tab("🎚 Mix Two Tracks"):
-        gr.Interface(
-            fn=mix_tracks,
-            inputs=[
-                gr.File(label="Main Track"),
-                gr.File(label="Background Track"),
-                gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
-            ],
-            outputs=gr.File(label="Mixed Output"),
-            title="Overlay Two Tracks",
-            description="Mix or subtract two audio files."
-        )
-
-    # --- Load/Save Project ---
-    with gr.Tab("📁 Save/Load Project"):
-        gr.Interface(
-            fn=save_project,
-            inputs=[
-                gr.File(label="Original Audio"),
-                gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
-                gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
-            ],
-            outputs=gr.File(label="Project File (.aiproj)"),
-            title="Save Everything Together",
-            description="Save your session, effects, and settings in one file to reuse later."
-        )
-
 demo.launch()
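A last operational note: all three model stacks (pyannote, faster-whisper, OpenVoice) are now loaded eagerly at import time, so a Space that has gone to sleep pays for every one of them on each cold start even if the visitor only opens one tab. A sketch of lazy, cached loading, with the same model names as above and `HF_TOKEN` as in the earlier sketch:

```python
import os
from functools import lru_cache

@lru_cache(maxsize=1)
def get_diarizer():
    # Heavy import and download happen on first use, not at startup.
    from pyannote.audio import Pipeline
    return Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=os.environ["HF_TOKEN"],
    )

@lru_cache(maxsize=1)
def get_whisper():
    from faster_whisper import WhisperModel
    return WhisperModel("base")
```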