# AudioMaster / app.py
import gradio as gr
from pydub import AudioSegment
import numpy as np
import tempfile
import os
import noisereduce as nr
import json
import pickle  # used by save_project below
import torch
from demucs import pretrained
from demucs.apply import apply_model
import torchaudio
from pathlib import Path
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import zipfile
import datetime
import librosa
import joblib
import warnings
from faster_whisper import WhisperModel
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
# Suppress warnings for cleaner logs
warnings.filterwarnings("ignore")
# === Helper Functions ===
def audiosegment_to_array(audio):
    return np.array(audio.get_array_of_samples()), audio.frame_rate
def array_to_audiosegment(samples, frame_rate, channels=1):
    return AudioSegment(
        samples.tobytes(),
        frame_rate=frame_rate,
        sample_width=samples.dtype.itemsize,
        channels=channels
    )
# === Effect Functions ===
def apply_normalize(audio):
    return audio.normalize()
def apply_noise_reduction(audio):
    samples, frame_rate = audiosegment_to_array(audio)
    # noisereduce returns a float array; cast back to the original integer dtype
    # so pydub gets a valid sample_width
    reduced = nr.reduce_noise(y=samples.astype(np.float32), sr=frame_rate)
    return array_to_audiosegment(reduced.astype(samples.dtype), frame_rate, channels=audio.channels)
def apply_compression(audio):
    return audio.compress_dynamic_range()
def apply_reverb(audio):
    reverb = audio - 10
    return audio.overlay(reverb, position=1000)
def apply_pitch_shift(audio, semitones=-2):
    # Shift pitch by playing the same samples at a scaled frame rate (2^(semitones/12)),
    # then resample back to the original rate so downstream effects and export see a standard rate
    new_frame_rate = int(audio.frame_rate * (2 ** (semitones / 12)))
    shifted = audio._spawn(audio.raw_data, overrides={"frame_rate": new_frame_rate})
    return shifted.set_frame_rate(audio.frame_rate)
def apply_echo(audio, delay_ms=500, decay=0.5):
    # Overlay a quieter copy after delay_ms; decay scales the echo level (0.5 -> -10 dB)
    echo = audio - (20 * (1 - decay))
    return audio.overlay(echo, position=delay_ms)
def apply_stereo_widen(audio, pan_amount=0.3):
    # pan() returns stereo segments, so take the left channel of the left-panned copy
    # and the right channel of the right-panned copy before recombining to mono pairs
    left = audio.pan(-pan_amount).split_to_mono()[0]
    right = audio.pan(pan_amount).split_to_mono()[1]
    return AudioSegment.from_mono_audiosegments(left, right)
def apply_bass_boost(audio, gain=10):
    # Boost lows by mixing a gained low-passed copy back into the original
    return audio.overlay(audio.low_pass_filter(100).apply_gain(gain))
def apply_treble_boost(audio, gain=10):
    # Boost highs by mixing a gained high-passed copy back into the original
    return audio.overlay(audio.high_pass_filter(4000).apply_gain(gain))
# === Vocal Isolation Helpers ===
def load_track_local(path, sample_rate, channels=2):
    sig, rate = torchaudio.load(path)
    if rate != sample_rate:
        sig = torchaudio.functional.resample(sig, rate, sample_rate)
    if channels == 1:
        sig = sig.mean(0, keepdim=True)
    elif channels == 2 and sig.shape[0] == 1:
        # Demucs expects stereo input; duplicate a mono channel
        sig = sig.repeat(2, 1)
    return sig
def save_track(path, wav, sample_rate):
    path = Path(path)
    torchaudio.save(str(path), wav, sample_rate)
def apply_vocal_isolation(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    # Normalize before separation and undo it afterwards, as in the Demucs reference code
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(model, wav[None])[0]
    sources = sources * ref.std() + ref.mean()
    # htdemucs returns sources in the order: drums, bass, other, vocals
    vocal_track = sources[3].cpu()
    out_path = os.path.join(tempfile.gettempdir(), "vocals.wav")
    save_track(out_path, vocal_track, model.samplerate)
    return out_path
# === Stem Splitting (Drums, Bass, Other, Vocals) ===
def stem_split(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    sources = apply_model(model, wav[None])[0]
    output_dir = tempfile.mkdtemp()
    stem_paths = []
    # htdemucs source order: drums, bass, other, vocals
    for i, name in enumerate(['drums', 'bass', 'other', 'vocals']):
        path = os.path.join(output_dir, f"{name}.wav")
        save_track(path, sources[i].cpu(), model.samplerate)
        stem_paths.append(path)
    # Return plain file paths; Gradio maps them positionally to the four File outputs
    return stem_paths
# === Preset Loader with Fallback ===
def load_presets():
    try:
        preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
        presets = {}
        for f in preset_files:
            path = os.path.join("presets", f)
            try:
                with open(path, "r") as infile:
                    data = json.load(infile)
                    if "name" in data and "effects" in data:
                        presets[data["name"]] = data["effects"]
            except json.JSONDecodeError:
                print(f"Invalid JSON: {f}")
        return presets
    except FileNotFoundError:
        print("Presets folder not found")
        return {}
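# Each preset lives in ./presets as a small JSON file with the two keys the loader
# above checks for. Illustrative example only (no preset files are bundled here):
# {
#   "name": "Clean Podcast",
#   "effects": ["Noise Reduction", "Normalize"]
# }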
preset_choices = load_presets()
if not preset_choices:
    preset_choices = {
        "Default": [],
        "Clean Podcast": ["Noise Reduction", "Normalize"],
        "Music Remix": ["Bass Boost", "Stereo Widening"]
    }
preset_names = list(preset_choices.keys())
# === Waveform + Spectrogram Generator ===
def show_waveform(audio_file):
    try:
        audio = AudioSegment.from_file(audio_file)
        samples = np.array(audio.get_array_of_samples())
        plt.figure(figsize=(10, 2))
        plt.plot(samples[:10000], color="blue")
        plt.axis("off")
        buf = BytesIO()
        plt.savefig(buf, format="png", bbox_inches="tight", dpi=100)
        plt.close()
        buf.seek(0)
        return Image.open(buf)
    except Exception:
        return None
def detect_genre(audio_path):
    # Placeholder classifier: MFCC features are extracted, but no trained model is wired in,
    # so any readable file is labeled "Speech".
    try:
        y, sr = torchaudio.load(audio_path)
        mfccs = librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13).mean(axis=1).reshape(1, -1)
        return "Speech"
    except Exception:
        return "Unknown"
# === Session Info Export ===
def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
    log = {
        "timestamp": str(datetime.datetime.now()),
        "filename": os.path.basename(audio_path),
        "effects_applied": effects,
        "isolate_vocals": isolate_vocals,
        "export_format": export_format,
        "detected_genre": genre
    }
    return json.dumps(log, indent=2)
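# Example of the JSON string this produces (values illustrative):
# {
#   "timestamp": "2025-01-01 12:00:00.000000",
#   "filename": "song.wav",
#   "effects_applied": ["Noise Reduction", "Normalize"],
#   "isolate_vocals": false,
#   "export_format": "MP3",
#   "detected_genre": "Speech"
# }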
# === Main Processing Function with Status Updates ===
def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading audio..."
    try:
        audio = AudioSegment.from_file(audio_file)
        status = "🛠 Applying effects..."
        effect_map = {
            "Noise Reduction": apply_noise_reduction,
            "Compress Dynamic Range": apply_compression,
            "Add Reverb": apply_reverb,
            "Pitch Shift": apply_pitch_shift,
            "Echo": apply_echo,
            "Stereo Widening": apply_stereo_widen,
            "Bass Boost": apply_bass_boost,
            "Treble Boost": apply_treble_boost,
            "Normalize": apply_normalize,
        }
        # A selected preset overrides the manually chosen effects
        effects_to_apply = preset_choices.get(preset_name, selected_effects)
        for effect_name in effects_to_apply:
            if effect_name in effect_map:
                audio = effect_map[effect_name](audio)
        status = "💾 Saving final audio..."
        # Use the chosen format as the file suffix so downloads get the right extension
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{export_format.lower()}") as f:
            if isolate_vocals:
                temp_input = os.path.join(tempfile.gettempdir(), "input.wav")
                audio.export(temp_input, format="wav")
                vocal_path = apply_vocal_isolation(temp_input)
                final_audio = AudioSegment.from_wav(vocal_path)
            else:
                final_audio = audio
            output_path = f.name
        final_audio.export(output_path, format=export_format.lower())
        waveform_image = show_waveform(output_path)
        genre = detect_genre(output_path)
        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
        status = "🎉 Done!"
        return output_path, waveform_image, session_log, genre, status
    except Exception as e:
        status = f"❌ Error: {str(e)}"
        return None, None, status, "", status
# === Batch Processing Function ===
def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading files..."
    try:
        output_dir = tempfile.mkdtemp()
        results = []
        session_logs = []
        for file in files:
            # gr.File may return plain paths or tempfile-like objects depending on the Gradio version
            file_path = file if isinstance(file, str) else file.name
            processed_path, _, log, _, _ = process_audio(file_path, selected_effects, isolate_vocals, preset_name, export_format)
            if processed_path:
                results.append(processed_path)
                session_logs.append(log)
        zip_path = os.path.join(output_dir, "batch_output.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for i, res in enumerate(results):
                filename = f"processed_{i}.{export_format.lower()}"
                zipf.write(res, filename)
                zipf.writestr(f"session_info_{i}.json", session_logs[i])
        return zip_path, "📦 ZIP created successfully!"
    except Exception as e:
        return None, f"❌ Batch processing failed: {str(e)}"
# === Whisper Transcription Tab ===
whisper_model = WhisperModel("base")
def transcribe_audio(audio_path):
    segments, info = whisper_model.transcribe(audio_path, beam_size=5)
    text = " ".join([seg.text for seg in segments])
    return text
# === TTS Tab ===
from TTS.api import TTS
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
def generate_tts(text):
    out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
    tts.tts_to_file(text=text, file_path=out_path)
    return out_path
# === Analyze Audio Stats ===
def analyze_audio(audio_path):
    y, sr = torchaudio.load(audio_path)
    samples = y.numpy().flatten()
    rms = np.mean(librosa.feature.rms(y=samples))
    tempo, _ = librosa.beat.beat_track(y=samples, sr=sr)
    silence_ratio = np.mean(np.abs(samples) < 0.01)
    plt.figure(figsize=(10, 4))
    plt.plot(samples, color="lightblue")
    plt.title("Loudness Over Time")
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf, format="png")
    plt.close()
    buf.seek(0)  # rewind the buffer before handing it to PIL
    image = Image.open(buf)
    stats = {
        "rms_loudness": float(rms),
        "silence_ratio": float(silence_ratio),
        # beat_track may return a scalar or a 1-element array depending on the librosa version
        "tempo_bpm": float(np.atleast_1d(tempo)[0])
    }
    return stats, image
# === Mix Two Tracks ===
def mix_tracks(track1, track2, volume_offset=0):
    a1 = AudioSegment.from_file(track1)
    a2 = AudioSegment.from_file(track2)
    mixed = a1.overlay(a2 - volume_offset)
    out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
    mixed.export(out_path, format="wav")
    return out_path
# === Save/Load Project File (.aiproj) ===
def save_project(audio_path, preset_name, effects):
    # Store the raw PCM data plus the parameters needed to rebuild the AudioSegment later
    audio = AudioSegment.from_file(audio_path)
    project_data = {
        "audio": audio.raw_data,
        "frame_rate": audio.frame_rate,
        "sample_width": audio.sample_width,
        "channels": audio.channels,
        "preset": preset_name,
        "effects": effects
    }
    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
    with open(out_path, "wb") as f:
        pickle.dump(project_data, f)
    return out_path
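# Not wired into the UI: an illustrative helper sketching how a .aiproj written by
# save_project above could be read back (hypothetical, assumes the dict layout used there).
def load_project(project_path):
    with open(project_path, "rb") as f:
        project_data = pickle.load(f)
    audio = AudioSegment(
        data=project_data["audio"],
        frame_rate=project_data["frame_rate"],
        sample_width=project_data["sample_width"],
        channels=project_data["channels"]
    )
    return audio, project_data["preset"], project_data["effects"]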
# UI Setup
effect_options = [
    "Noise Reduction",
    "Compress Dynamic Range",
    "Add Reverb",
    "Pitch Shift",
    "Echo",
    "Stereo Widening",
    "Bass Boost",
    "Treble Boost",
    "Normalize"
]
with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
    gr.Markdown("## 🎧 Ultimate AI Audio Studio\nUpload, edit, export - powered by AI!")
    # --- Single File Studio ---
    with gr.Tab("🎡 Single File Studio"):
        gr.Interface(
            fn=process_audio,
            inputs=[
                gr.Audio(label="Upload Audio", type="filepath"),
                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
                gr.Checkbox(label="Isolate Vocals After Effects"),
                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
            ],
            outputs=[
                gr.Audio(label="Processed Audio", type="filepath"),
                gr.Image(label="Waveform Preview"),
                gr.Textbox(label="Session Log (JSON)", lines=5),
                gr.Textbox(label="Detected Genre", lines=1),
                gr.Textbox(label="Status", value="✅ Ready", lines=1)
            ],
            title="Edit One File at a Time",
            description="Apply effects, preview waveform, and get full session log.",
            flagging_mode="never",
            submit_btn="Process Audio",
            clear_btn=None
        )
    # --- Batch Processing ---
    with gr.Tab("🔊 Batch Processing"):
        gr.Interface(
            fn=batch_process_audio,
            inputs=[
                gr.File(label="Upload Multiple Files", file_count="multiple"),
                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
                gr.Checkbox(label="Isolate Vocals After Effects"),
                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
            ],
            outputs=[
                gr.File(label="Download ZIP of All Processed Files"),
                gr.Textbox(label="Status", value="✅ Ready", lines=1)
            ],
            title="Batch Audio Processor",
            description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
            flagging_mode="never",
            submit_btn="Process All Files",
            clear_btn=None
        )
    # --- Remix Mode ---
    with gr.Tab("🎛 Remix Mode"):
        gr.Interface(
            fn=stem_split,
            inputs=gr.Audio(label="Upload Music Track", type="filepath"),
            outputs=[
                # Labels follow the order stem_split returns: drums, bass, other, vocals
                gr.File(label="Drums"),
                gr.File(label="Bass"),
                gr.File(label="Other"),
                gr.File(label="Vocals")
            ],
            title="Split Into Drums, Bass, Vocals, and More",
            description="Use AI to separate musical elements like vocals, drums, and bass.",
            flagging_mode="never",
            clear_btn=None
        )
    # --- Transcribe & Edit ---
    with gr.Tab("📝 Transcribe & Edit"):
        gr.Interface(
            fn=transcribe_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            outputs=gr.Textbox(label="Transcribed Text", lines=10),
            title="Transcribe & Edit Spoken Content",
            description="Convert voice to text, then edit the script before exporting again."
        )
    # --- TTS Voice Generator ---
    with gr.Tab("💬 TTS Voice Generator"):
        gr.Interface(
            fn=generate_tts,
            inputs=gr.Textbox(label="Enter Text", lines=5),
            outputs=gr.Audio(label="Generated Speech", type="filepath"),
            title="Text-to-Speech Generator",
            description="Type anything and turn it into natural-sounding speech."
        )
    # --- Audio Analysis Dashboard ---
    with gr.Tab("📊 Audio Analysis"):
        gr.Interface(
            fn=analyze_audio,
            inputs=gr.Audio(label="Upload Track", type="filepath"),
            outputs=[
                gr.JSON(label="Audio Stats"),
                gr.Image(label="Waveform Graph")
            ],
            title="View Loudness, BPM, Silence, and More",
            description="Analyze audio loudness, tempo, and frequency content."
        )
    # --- Mix Two Tracks ---
    with gr.Tab("🔀 Mix Two Tracks"):
        gr.Interface(
            fn=mix_tracks,
            inputs=[
                gr.File(label="Main Track"),
                gr.File(label="Background Track"),
                gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
            ],
            outputs=gr.File(label="Mixed Output"),
            title="Overlay Two Tracks",
            description="Mix or subtract two audio files."
        )
    # --- Load/Save Project ---
    with gr.Tab("📁 Save/Load Project"):
        gr.Interface(
            fn=save_project,
            inputs=[
                gr.File(label="Original Audio"),
                gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
                gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
            ],
            outputs=gr.File(label="Project File (.aiproj)"),
            title="Save Everything Together",
            description="Save your session, effects, and settings in one file to reuse later."
        )
demo.launch()