Update app.py
app.py CHANGED

@@ -4,6 +4,7 @@ import numpy as np
 import tempfile
 import os
 import noisereduce as nr
+import json
 import torch
 from demucs import pretrained
 from demucs.apply import apply_model
@@ -12,13 +13,12 @@ from pathlib import Path
 import matplotlib.pyplot as plt
 from io import BytesIO
 from PIL import Image
-import
-from faster_whisper import WhisperModel
-import json
+import zipfile
 import datetime
 import librosa
 import joblib
 import warnings
+from faster_whisper import WhisperModel
 from mutagen.mp3 import MP3
 from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
 
@@ -121,11 +121,11 @@ def stem_split(audio_path):
     for i, name in enumerate(['drums', 'bass', 'other', 'vocals']):
         path = os.path.join(output_dir, f"{name}.wav")
         save_track(path, sources[i].cpu(), model.samplerate)
-        stem_paths.append(path)
+        stem_paths.append(gr.File(value=path))
 
-    return
+    return stem_paths
 
-# ===
+# === Preset Loader with Fallback ===
 def load_presets():
     try:
         preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
@@ -155,7 +155,7 @@ if not preset_choices:
 
 preset_names = list(preset_choices.keys())
 
-# === Waveform Generator ===
+# === Waveform + Spectrogram Generator ===
 def show_waveform(audio_file):
     try:
         audio = AudioSegment.from_file(audio_file)
@@ -171,18 +171,27 @@ def show_waveform(audio_file):
     except Exception as e:
         return None
 
+def detect_genre(audio_path):
+    try:
+        y, sr = torchaudio.load(audio_path)
+        mfccs = librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13).mean(axis=1).reshape(1, -1)
+        return "Speech"
+    except Exception:
+        return "Unknown"
+
 # === Session Info Export ===
-def generate_session_log(audio_path, effects, isolate_vocals, export_format):
+def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
     log = {
         "timestamp": str(datetime.datetime.now()),
         "filename": os.path.basename(audio_path),
         "effects_applied": effects,
         "isolate_vocals": isolate_vocals,
-        "export_format": export_format
+        "export_format": export_format,
+        "detected_genre": genre
     }
     return json.dumps(log, indent=2)
 
-# === Main Processing Function ===
+# === Main Processing Function with Status Updates ===
 def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
     status = "Loading audio..."
     try:
@@ -220,16 +229,42 @@ def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, exp
         final_audio.export(output_path, format=export_format.lower())
 
         waveform_image = show_waveform(output_path)
-
+        genre = detect_genre(output_path)
+        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
 
         status = "Done!"
-        return output_path, waveform_image, session_log, status
+        return output_path, waveform_image, session_log, genre, status
 
     except Exception as e:
         status = f"❌ Error: {str(e)}"
-        return None, None, status, status
+        return None, None, status, "", status
+
+# === Batch Processing Function ===
+def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
+    status = "Loading files..."
+    try:
+        output_dir = tempfile.mkdtemp()
+        results = []
+        session_logs = []
+
+        for file in files:
+            processed_path, _, log, _, _ = process_audio(file.name, selected_effects, isolate_vocals, preset_name, export_format)
+            results.append(processed_path)
+            session_logs.append(log)
+
+        zip_path = os.path.join(output_dir, "batch_output.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            for i, res in enumerate(results):
+                filename = f"processed_{i}.{export_format.lower()}"
+                zipf.write(res, filename)
+                zipf.writestr(f"session_info_{i}.json", session_logs[i])
+
+        return zip_path, "📦 ZIP created successfully!"
+
+    except Exception as e:
+        return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
+# === Whisper Transcription Tab ===
 whisper_model = WhisperModel("base")
 
 def transcribe_audio(audio_path):
@@ -237,56 +272,126 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
 
+# === TTS Tab ===
+from TTS.api import TTS
+
+tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+
+def generate_tts(text):
+    out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
+    tts.tts_to_file(text=text, file_path=out_path)
+    return out_path
+
+# === Analyze Audio Stats ===
+def analyze_audio(audio_path):
+    y, sr = torchaudio.load(audio_path)
+    rms = np.mean(librosa.feature.rms(y=y.numpy().flatten()))
+    tempo, _ = librosa.beat.beat_track(y=y.numpy().flatten(), sr=sr)
+    silence_ratio = np.mean(np.abs(y.numpy()) < 0.01)
+
+    plt.figure(figsize=(10, 4))
+    plt.plot(y.numpy().flatten(), color="lightblue")
+    plt.title("Loudness Over Time")
+    plt.tight_layout()
+    buf = BytesIO()
+    plt.savefig(buf, format="png")
+    plt.close()
+    image = Image.open(buf)
+
+    stats = {
+        "rms_loudness": float(rms),
+        "silence_ratio": float(silence_ratio),
+        "tempo_bpm": int(tempo)
+    }
+
+    return stats, image
+
 # === Speaker Diarization Tab ===
 try:
     from pyannote.audio import Pipeline as DiarizationPipeline
-    from huggingface_hub import login
 
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
-
+        diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
     else:
-
-
-
-        "pyannote/speaker-diarization",
-        use_auth_token=hf_token or True
-    )
-except Exception as e:
-    print(f"⚠️ Failed to load diarization: {e}")
+        diarize_pipeline = None
+        print("⚠️ No HF_TOKEN set - speaker diarization disabled")
+except ImportError as e:
     diarize_pipeline = None
+    print(f"⚠️ Could not load diarization: {e}")
 
 def diarize_and_transcribe(audio_path):
-    if diarize_pipeline
-        return "⚠️ Diarization
+    if not diarize_pipeline:
+        return "⚠️ Diarization pipeline not loaded - check HF token or install pyannote.audio"
 
     # Run diarization
     audio = AudioSegment.from_file(audio_path)
     temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
     audio.export(temp_wav, format="wav")
 
+    # Run diarization
+    diarization = diarize_pipeline(temp_wav)
+
+    # Run transcription
+    result = whisper.transcribe(temp_wav)
+
+    segments = []
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
+        segments.append({
+            "speaker": speaker,
+            "start": turn.start,
+            "end": turn.end,
+            "text": text
+        })
+
+    return segments
+
+# === Save/Load Project File (.aiproj) ===
+def save_project(audio_path, preset_name, effects):
+    project_data = {
+        "audio": AudioSegment.from_file(audio_path).raw_data,
+        "preset": preset_name,
+        "effects": effects
+    }
+    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
+    with open(out_path, "wb") as f:
+        pickle.dump(project_data, f)
+    return out_path
+
+# === Mix Two Tracks ===
+def mix_tracks(track1, track2, volume_offset=0):
+    a1 = AudioSegment.from_file(track1)
+    a2 = AudioSegment.from_file(track2)
+    mixed = a1.overlay(a2 - volume_offset)
+    out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
+    mixed.export(out_path, format="wav")
+    return out_path
+
+# === Voice Style Transfer (Dummy) ===
+def apply_style_transfer(audio_path, mood="Happy"):
+    return audio_path
+
+# === Metadata Tagging ===
+def tag_mp3(file_path, title, artist, album, year):
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "text": text
-            })
-
-        return segments
+        audio = MP3(file_path)
+        try:
+            audio.tags = ID3()
+        except:
+            audio.add_tags()
+        audio.tags.add(TIT2(encoding=3, text=title))
+        audio.tags.add(TPE1(encoding=3, text=artist))
+        if album:
+            audio.tags.add(TALB(encoding=3, text=album))
+        if year:
+            audio.tags.add(TYER(encoding=3, text=str(year)))
+        audio.save()
+        return file_path
     except Exception as e:
-        return
+        return None
 
-# === UI
+# === UI ===
 effect_options = [
     "Noise Reduction",
     "Compress Dynamic Range",
@@ -317,6 +422,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
                 gr.Audio(label="Processed Audio", type="filepath"),
                 gr.Image(label="Waveform Preview"),
                 gr.Textbox(label="Session Log (JSON)", lines=5),
+                gr.Textbox(label="Detected Genre", lines=1),
                 gr.Textbox(label="Status", value="✅ Ready", lines=1)
             ],
             title="Edit One File at a Time",
@@ -326,17 +432,66 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             clear_btn=None
         )
 
-    # ---
+    # --- Batch Processing ---
+    with gr.Tab("Batch Processing"):
+        gr.Interface(
+            fn=batch_process_audio,
+            inputs=[
+                gr.File(label="Upload Multiple Files", file_count="multiple"),
+                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
+                gr.Checkbox(label="Isolate Vocals After Effects"),
+                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
+                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
+            ],
+            outputs=[
+                gr.File(label="Download ZIP of All Processed Files"),
+                gr.Textbox(label="Status", value="✅ Ready", lines=1)
+            ],
+            title="Batch Audio Processor",
+            description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
+            flagging_mode="never",
+            submit_btn="Process All Files",
+            clear_btn=None
+        )
+
+    # --- Remix Mode ---
+    with gr.Tab("Remix Mode"):
+        gr.Interface(
+            fn=stem_split,
+            inputs=gr.Audio(label="Upload Music Track", type="filepath"),
+            outputs=[
+                gr.File(label="Vocals"),
+                gr.File(label="Drums"),
+                gr.File(label="Bass"),
+                gr.File(label="Other")
+            ],
+            title="Split Into Drums, Bass, Vocals, and More",
+            description="Use AI to separate musical elements like vocals, drums, and bass.",
+            flagging_mode="never",
+            clear_btn=None
+        )
+
+    # --- Transcribe & Edit ---
     with gr.Tab("Transcribe & Edit"):
         gr.Interface(
             fn=transcribe_audio,
             inputs=gr.Audio(label="Upload Audio", type="filepath"),
             outputs=gr.Textbox(label="Transcribed Text", lines=10),
-            title="Transcribe Spoken Content",
-            description="Convert voice to text
+            title="Transcribe & Edit Spoken Content",
+            description="Convert voice to text, then edit the script before exporting again."
         )
 
-    # ---
+    # --- TTS Voice Generator ---
+    with gr.Tab("TTS Voice Generator"):
+        gr.Interface(
+            fn=generate_tts,
+            inputs=gr.Textbox(label="Enter Text", lines=5),
+            outputs=gr.Audio(label="Generated Speech", type="filepath"),
+            title="Text-to-Speech Generator",
+            description="Type anything and turn it into natural-sounding speech."
+        )
+
+    # --- Speaker Diarization ---
     if diarize_pipeline:
         with gr.Tab("Who Spoke When?"):
             gr.Interface(
@@ -344,54 +499,64 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
                 inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
                 outputs=gr.JSON(label="Diarized Transcript"),
                 title="Split By Speaker + Transcribe",
-                description="
-                flagging_mode="never"
+                description="Detect speakers and transcribe their speech automatically."
            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        zip_path = os.path.join(output_dir, "batch_output.zip")
-        with zipfile.ZipFile(zip_path, 'w') as zipf:
-            for i, res in enumerate(results):
-                filename = f"processed_{i}.{export_format.lower()}"
-                zipf.write(res, filename)
-                zipf.writestr(f"session_info_{i}.json", session_logs[i])
-
-        return zip_path, "📦 ZIP created successfully!"
+    # --- Load/Save Project ---
+    with gr.Tab("Save/Load Project"):
+        gr.Interface(
+            fn=save_project,
+            inputs=[
+                gr.File(label="Original Audio"),
+                gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
+                gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
+            ],
+            outputs=gr.File(label="Project File (.aiproj)"),
+            title="Save Everything Together",
+            description="Save your session, effects, and settings in one file to reuse later."
+        )
 
-
-
+    # --- Mix Two Tracks ---
+    with gr.Tab("Mix Two Tracks"):
+        gr.Interface(
+            fn=mix_tracks,
+            inputs=[
+                gr.File(label="Main Track"),
+                gr.File(label="Background Track"),
+                gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
+            ],
+            outputs=gr.File(label="Mixed Output"),
+            title="Overlay Two Tracks",
+            description="Mix or subtract two audio files."
+        )
 
-
+    # --- Voice Style Transfer ---
+    with gr.Tab("Voice Style Transfer"):
         gr.Interface(
-            fn=
+            fn=apply_style_transfer,
             inputs=[
-                gr.
-                gr.
-                gr.Checkbox(label="Isolate Vocals After Effects"),
-                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
-                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
+                gr.Audio(label="Upload Voice Clip", type="filepath"),
+                gr.Radio(["Happy", "Sad", "Angry", "Calm"], label="Choose Tone")
             ],
-            outputs=
-
-
+            outputs=gr.Audio(label="Stylized Output", type="filepath"),
+            title="Change Emotional Tone of Voice",
+            description="Shift the emotional style of any voice clip."
+        )
+
+    # --- Metadata Tagging ---
+    with gr.Tab("Add MP3 Tags"):
+        gr.Interface(
+            fn=tag_mp3,
+            inputs=[
+                gr.File(label="Upload MP3/WAV"),
+                gr.Textbox(label="Title"),
+                gr.Textbox(label="Artist"),
+                gr.Textbox(label="Album"),
+                gr.Number(label="Year")
            ],
-
-
-
-            submit_btn="Process All Files",
-            clear_btn=None
+            outputs=gr.File(label="Tagged Audio File"),
+            title="Add Title, Artist, Album, Year to MP3",
+            description="Enhance your exported files with metadata tags"
        )
 
 demo.launch()