Update app.py
app.py
CHANGED
@@ -266,7 +266,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
+# === Whisper Transcription Tab ===
 whisper_model = WhisperModel("base")
 
 def transcribe_audio(audio_path):
@@ -274,7 +274,7 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
 
-# === TTS
+# === TTS Voice Generator ===
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
 def generate_tts(text):
@@ -282,6 +282,23 @@ def generate_tts(text):
     tts.tts_to_file(text=text, file_path=out_path)
     return out_path
 
+# === Save/Load Project File (.aiproj) ===
+def save_project(audio_path, preset_name, effects):
+    project_data = {
+        "audio": AudioSegment.from_file(audio_path).raw_data,
+        "preset": preset_name,
+        "effects": effects
+    }
+    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
+    with open(out_path, "wb") as f:
+        pickle.dump(project_data, f)
+    return out_path
+
+def load_project(project_file):
+    with open(project_file.name, "rb") as f:
+        data = pickle.load(f)
+    return data["preset"], data["effects"]
+
 # === Trim Silence Automatically (VAD) ===
 def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
     audio = AudioSegment.from_file(audio_file)
@@ -309,42 +326,53 @@ def mix_tracks(track1, track2, volume_offset=0):
     mixed.export(out_path, format="wav")
     return out_path
 
-# ===
-def save_project(audio_path, preset_name, effects):
-    project_data = {
-        "audio": AudioSegment.from_file(audio_path).raw_data,
-        "preset": preset_name,
-        "effects": effects
-    }
-    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
-    with open(out_path, "wb") as f:
-        pickle.dump(project_data, f)
-    return out_path
+# === Speaker Diarization ("Who Spoke When?") ===
+try:
+    from pyannote.audio import Pipeline as DiarizationPipeline
+    from huggingface_hub import login
 
-def load_project(project_file):
-    with open(project_file.name, "rb") as f:
-        data = pickle.load(f)
-    return data["preset"], data["effects"]
+    hf_token = os.getenv("HF_TOKEN")
+    if hf_token:
+        login(token=hf_token)
+    else:
+        print("⚠️ HF_TOKEN not set – speaker diarization disabled")
 
-
-def save_or_resume_session(audio, preset, effects, action="save"):
-    if action == "save":
-        return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
-    elif action == "load" and isinstance(audio, dict):
-        return (
-            None,
-            audio.get("audio"),
-            audio.get("preset"),
-            audio.get("effects")
-        )
-    return None, None, None, None
+    diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
+except ImportError:
+    diarize_pipeline = None
+    print("⚠️ PyAnnote not installed – speaker diarization disabled")
 
-
-
-
-    return generate_tts(text)
+def diarize_and_transcribe(audio_path):
+    if diarize_pipeline is None:
+        return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
 
-#
+    # Run diarization
+    audio = AudioSegment.from_file(audio_path)
+    temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
+    audio.export(temp_wav, format="wav")
+
+    try:
+        from pyannote.audio import Pipeline as DiarizationPipeline
+        diarization = diarize_pipeline(temp_wav)
+
+        # Run transcription
+        result = whisper.transcribe(temp_wav)
+
+        segments = []
+        for turn, _, speaker in diarization.itertracks(yield_label=True):
+            text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
+            segments.append({
+                "speaker": speaker,
+                "start": turn.start,
+                "end": turn.end,
+                "text": text
+            })
+
+        return segments
+    except Exception as e:
+        return f"⚠️ Diarization failed: {str(e)}"
+
+# === UI ===
 effect_options = [
     "Noise Reduction",
     "Compress Dynamic Range",
@@ -424,17 +452,17 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
         clear_btn=None
     )
 
-    # --- Transcribe & Edit
+    # --- Transcribe & Edit Tab ===
    with gr.Tab("📝 Transcribe & Edit"):
        gr.Interface(
            fn=transcribe_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            outputs=gr.Textbox(label="Transcribed Text", lines=10),
-            title="Transcribe Spoken Content",
+            title="Transcribe & Edit Spoken Content",
            description="Convert voice to text and edit it before exporting again."
        )
 
-    # --- TTS Voice Generator
+    # --- TTS Voice Generator ===
    with gr.Tab("💬 TTS Voice Generator"):
        gr.Interface(
            fn=generate_tts,
@@ -444,7 +472,52 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
            description="Type anything and turn it into natural-sounding speech."
        )
 
-    # ---
+    # --- Speaker Diarization (Who Spoke When?) ===
+    with gr.Tab("🧏‍♂️ Who Spoke When?"):
+        gr.Interface(
+            fn=diarize_and_transcribe,
+            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+            outputs=gr.JSON(label="Diarized Transcript"),
+            title="Split By Speaker + Transcribe",
+            description="Detect speakers and transcribe their speech automatically."
+        )
+
+    # --- Auto-Save / Resume Sessions ===
+    session_state = gr.State()
+
+    def save_or_resume_session(audio, preset, effects, action="save"):
+        if action == "save":
+            return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
+        elif action == "load" and isinstance(audio, dict):
+            return (
+                None,
+                audio.get("audio"),
+                audio.get("preset"),
+                audio.get("effects")
+            )
+        return None, None, None, None
+
+    with gr.Tab("🧾 Auto-Save & Resume"):
+        gr.Markdown("Save your current state and resume later.")
+
+        action_radio = gr.Radio(["save", "load"], label="Action", value="save")
+        audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
+        preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
+        effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
+        action_btn = gr.Button("Save or Load Session")
+
+        session_data = gr.State()
+        loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
+        loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
+        loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
+
+        action_btn.click(
+            fn=save_or_resume_session,
+            inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio],
+            outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
+        )
+
+    # --- Trim Silence Automatically (VAD) ===
    with gr.Tab("✂️ Trim Silence Automatically"):
        gr.Interface(
            fn=detect_silence,
@@ -483,28 +556,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
            description="Load your saved session"
        )
 
-    # --- Auto-Save / Resume Sessions ===
-    session_state = gr.State()
-
-    with gr.Tab("🧾 Auto-Save & Resume"):
-        gr.Markdown("Save your current state and resume editing later.")
-
-        action_radio = gr.Radio(["save", "load"], label="Action", value="save")
-        audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
-        preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
-        effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
-        save_btn = gr.Button("Save or Load Session")
-
-        loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
-        loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
-        loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
-
-        save_btn.click(
-            fn=save_or_resume_session,
-            inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio],
-            outputs=[session_state, loaded_audio, loaded_preset, loaded_effects]
-        )
-
    # --- Mix Two Tracks ===
    with gr.Tab("🎚 Mix Two Tracks"):
        gr.Interface(
@@ -516,37 +567,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
            ],
            outputs=gr.File(label="Mixed Output"),
            title="Overlay Two Tracks",
-            description="Mix
-    )
-
-    # === Voice Style Transfer (Dummy) ===
-    def apply_style_transfer(audio_path, mood="Happy"):
-        return audio_path
-
-    with gr.Tab("🎧 Voice Style Transfer"):
-        gr.Interface(
-            fn=apply_style_transfer,
-            inputs=[
-                gr.Audio(label="Upload Voice Clip", type="filepath"),
-                gr.Radio(["Happy", "Sad", "Angry", "Calm"], label="Choose Tone")
-            ],
-            outputs=gr.Audio(label="Stylized Output", type="filepath"),
-            title="Change Emotional Tone of Voice",
-            description="Shift the emotional style of any voice clip."
-        )
-
-    # --- Voice Cloning (Fallback) ===
-    with gr.Tab("🎭 Voice Cloning (Demo)"):
-        gr.Interface(
-            fn=clone_voice,
-            inputs=[
-                gr.File(label="Source Voice Clip"),
-                gr.File(label="Target Voice Clip"),
-                gr.Textbox(label="Text to Clone", lines=5)
-            ],
-            outputs=gr.Audio(label="Cloned Output", type="filepath"),
-            title="Replace One Voice With Another (Demo)",
-            description="Clone voice from source to target speaker using AI"
+            description="Mix or subtract two audio files."
        )
 
 demo.launch()
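Note on the new diarization code: diarize_and_transcribe calls whisper.transcribe(temp_wav), but the only speech model defined in the file is the faster-whisper WhisperModel bound to whisper_model (new line 270), and faster-whisper's transcribe returns a (segments, info) pair whose segments are objects with .start/.end/.text attributes, not an openai-whisper style dict. A minimal sketch of the transcription step rewritten under that assumption (fw_segments and _info are illustrative names):

    # Sketch: per-speaker text collection against faster-whisper's API.
    # Assumes whisper_model = WhisperModel("base") as declared earlier in
    # app.py, and diarization / temp_wav as in diarize_and_transcribe.
    fw_segments, _info = whisper_model.transcribe(temp_wav)
    fw_segments = list(fw_segments)  # materialize: the generator is single-use

    segments = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        text = " ".join(
            seg.text.strip() for seg in fw_segments
            if seg.start >= turn.start and seg.end <= turn.end
        )
        segments.append({"speaker": speaker, "start": turn.start, "end": turn.end, "text": text})

The strict containment test (seg.start >= turn.start and seg.end <= turn.end) mirrors the committed logic and silently drops segments that straddle a speaker change; an overlap test would be more forgiving.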
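For reference, a round-trip of the new .aiproj helpers might look like the sketch below. It assumes save_project and load_project are in scope (importing app.py directly would also execute demo.launch()), and "clip.wav" is a hypothetical local file; the _Upload class stands in for the Gradio upload object, of which load_project reads only the .name attribute:

    # Hypothetical round-trip for save_project/load_project as added above.
    class _Upload:
        def __init__(self, name):
            self.name = name  # load_project only dereferences .name

    proj_path = save_project("clip.wav", "Podcast", ["Noise Reduction"])
    preset, effects = load_project(_Upload(proj_path))
    print(preset, effects)  # -> Podcast ['Noise Reduction']

Two caveats: raw_data stores bare PCM bytes without sample rate, width, or channel count, so the audio itself cannot be rebuilt from the file (consistent with load_project returning only preset and effects), and unpickling an .aiproj from an untrusted source can execute arbitrary code, as with any pickle.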
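One more observation on the Auto-Save & Resume wiring: the "load" branch of save_or_resume_session only returns saved values when audio is a dict, but audio_input is a filepath gr.Audio component, so that branch is unreachable from this UI; the dict stored in session_data is never read back (and session_state is unused). A hedged sketch of one way to close the loop, using a separate hypothetical button inside the same gr.Blocks context:

    # Hypothetical rewiring inside the Auto-Save tab: read the saved dict
    # from the session_data state instead of expecting it in the audio input.
    def resume_session(saved):
        if isinstance(saved, dict):
            return saved.get("audio"), saved.get("preset"), saved.get("effects")
        return None, None, None

    load_btn = gr.Button("Load Session")  # hypothetical extra button
    load_btn.click(
        fn=resume_session,
        inputs=[session_data],
        outputs=[loaded_audio, loaded_preset, loaded_effects]
    )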