# AudioMaster / app.py
import gradio as gr
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import numpy as np
import tempfile
import os
import noisereduce as nr
import json
import torch
from demucs import pretrained
from demucs.apply import apply_model
import torchaudio
from pathlib import Path
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import zipfile
import datetime
import librosa
import warnings
from faster_whisper import WhisperModel
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
from TTS.api import TTS
import pickle
# Suppress warnings
warnings.filterwarnings("ignore")
# === Helper Functions ===
def audiosegment_to_array(audio):
return np.array(audio.get_array_of_samples()), audio.frame_rate
def array_to_audiosegment(samples, frame_rate, channels=1):
return AudioSegment(
samples.tobytes(),
frame_rate=frame_rate,
sample_width=samples.dtype.itemsize,
channels=channels
)
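# The two helpers above round-trip between pydub and NumPy: get_array_of_samples()
# yields interleaved samples (L/R/L/R... for stereo), so channel count and sample
# width must be preserved when rebuilding the segment. Illustrative sketch:
#   samples, sr = audiosegment_to_array(seg)
#   restored = array_to_audiosegment(samples.astype(np.int16), sr, channels=seg.channels)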
# === Effect Functions ===
def apply_normalize(audio):
return audio.normalize()
def apply_noise_reduction(audio):
    samples, frame_rate = audiosegment_to_array(audio)
    reduced = nr.reduce_noise(y=samples.astype(np.float32), sr=frame_rate)
    # noisereduce returns floats; cast back to int16 so the sample width stays 2 bytes
    return array_to_audiosegment(reduced.astype(np.int16), frame_rate, channels=audio.channels)
def apply_compression(audio):
return audio.compress_dynamic_range()
def apply_reverb(audio):
    # crude reverb approximation: overlay a quieter copy of the signal delayed by 1 s
    reverb = audio - 10
    return audio.overlay(reverb, position=1000)
def apply_pitch_shift(audio, semitones=-2):
    # naive resampling pitch shift: stretch the sample array by the pitch ratio and
    # keep the original frame rate (duration changes along with pitch)
    ratio = 2 ** (semitones / 12)
    samples = np.array(audio.get_array_of_samples())
    resampled = np.interp(
        np.arange(0, len(samples), ratio),
        np.arange(len(samples)),
        samples
    ).astype(np.int16)
    return AudioSegment(
        resampled.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=2,  # data was cast to int16 above
        channels=audio.channels
    )
def apply_echo(audio, delay_ms=500, decay=0.5):
    # single echo tap: attenuate the copy according to the decay factor, then delay it
    echo = audio.apply_gain(20 * np.log10(max(decay, 1e-3)))
    return audio.overlay(echo, position=delay_ms)
def apply_stereo_widen(audio, pan_amount=0.3):
    # Haas-style widening: split into mono channels and delay one side slightly
    # (pan() returns stereo segments, which from_mono_audiosegments rejects)
    stereo = audio.set_channels(2)
    left, right = stereo.split_to_mono()
    delay_ms = int(pan_amount * 30)  # up to ~30 ms inter-channel delay
    right = AudioSegment.silent(duration=delay_ms, frame_rate=right.frame_rate) + right
    left = left + AudioSegment.silent(duration=delay_ms, frame_rate=left.frame_rate)
    return AudioSegment.from_mono_audiosegments(left, right)
def apply_bass_boost(audio, gain=10):
    # boost the lows by overlaying a boosted low-passed copy on the original signal
    low = audio.low_pass_filter(100).apply_gain(gain)
    return audio.overlay(low)
def apply_treble_boost(audio, gain=10):
    # boost the highs by overlaying a boosted high-passed copy on the original signal
    high = audio.high_pass_filter(4000).apply_gain(gain)
    return audio.overlay(high)
def apply_noise_gate(audio, threshold=-50.0, attack=50, release=100):
    # zero out samples whose level falls below the threshold (given in dBFS);
    # attack/release are reserved for future envelope smoothing
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    peak = np.max(np.abs(samples))
    if peak < 1:
        return audio
    normalized = samples / peak
    envelope = np.abs(normalized)
    thresh_linear = 10 ** (threshold / 20)  # e.g. -50 dB -> ~0.003
    gated = np.where(envelope > thresh_linear, normalized, 0)
    return array_to_audiosegment((gated * np.iinfo(np.int16).max).astype(np.int16),
                                 audio.frame_rate, channels=audio.channels)
def apply_limiter(audio, limit_dB=-1):
    # simple peak limiter: if the peak exceeds the ceiling, pull the whole segment
    # down so the peak sits at limit_dB
    headroom = limit_dB - audio.max_dBFS
    return audio.apply_gain(min(headroom, 0))
def apply_phaser(audio, rate=0.5, depth=0.7, feedback=0.2, mix=0.5):
    # placeholder "phaser"/"flanger": approximated by scaling the frame rate;
    # depth, feedback, and mix are currently unused
    return audio._spawn(audio.raw_data, overrides={"frame_rate": int(audio.frame_rate * rate)})
def apply_bitcrush(audio, bit_depth=8):
    # quantize to the target bit depth, then rescale so the output keeps its loudness
    samples = np.array(audio.get_array_of_samples())
    step = max(np.iinfo(np.int16).max // (2 ** bit_depth), 1)
    crushed = ((samples // step) * step).astype(np.int16)
    return array_to_audiosegment(crushed, audio.frame_rate, channels=audio.channels)
def apply_auto_gain(audio, target_dB=-20):
change = target_dB - audio.dBFS
return audio.apply_gain(change)
def apply_vocal_distortion(audio, intensity=0.3):
    # waveshaping distortion: add a scaled sine of the signal, then clip to int16 range
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    max_val = np.iinfo(np.int16).max
    distorted = samples + intensity * max_val * np.sin(samples * 2 * np.pi / 32768)
    distorted = np.clip(distorted, -max_val - 1, max_val)
    return array_to_audiosegment(distorted.astype(np.int16), audio.frame_rate, channels=audio.channels)
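# All of the effect functions above take an AudioSegment and return a new
# AudioSegment, so process_audio() below can chain them in any user-selected order.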
# === Vocal Isolation Helpers ===
def load_track_local(path, sample_rate, channels=2):
    sig, rate = torchaudio.load(path)
    if rate != sample_rate:
        sig = torchaudio.functional.resample(sig, rate, sample_rate)
    if channels == 1:
        sig = sig.mean(0, keepdim=True)
    elif sig.shape[0] == 1 and channels == 2:
        # Demucs expects stereo input; duplicate a mono channel
        sig = sig.repeat(2, 1)
    return sig
def save_track(path, wav, sample_rate):
path = Path(path)
torchaudio.save(str(path), wav, sample_rate)
def apply_vocal_isolation(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    # normalize the mix the way the demucs CLI does, then undo it on the output
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / (ref.std() + 1e-8)
    sources = apply_model(model, wav[None])[0]
    sources = sources * ref.std() + ref.mean()
    # htdemucs source order is ['drums', 'bass', 'other', 'vocals']
    vocal_track = sources[model.sources.index('vocals')].cpu()
    out_path = os.path.join(tempfile.gettempdir(), "vocals.wav")
    save_track(out_path, vocal_track, model.samplerate)
    return out_path
# === Stem Splitting (Drums, Bass, Other, Vocals) ===
def stem_split(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    sources = apply_model(model, wav[None])[0]
    output_dir = tempfile.mkdtemp()
    stem_paths = []
    # model.sources for htdemucs is ['drums', 'bass', 'other', 'vocals']
    for i, name in enumerate(model.sources):
        path = os.path.join(output_dir, f"{name}.wav")
        save_track(path, sources[i].cpu(), model.samplerate)
        stem_paths.append(path)
    return stem_paths
# === Preset Loader with Fallback ===
def load_presets():
try:
preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
presets = {}
for f in preset_files:
path = os.path.join("presets", f)
try:
with open(path, "r") as infile:
data = json.load(infile)
if "name" in data and "effects" in data:
presets[data["name"]] = data["effects"]
except json.JSONDecodeError:
print(f"Invalid JSON: {f}")
return presets
except FileNotFoundError:
print("Presets folder not found")
return {}
preset_choices = load_presets()
if not preset_choices:
preset_choices = {
"Default": [],
"Clean Podcast": ["Noise Reduction", "Normalize"],
"Podcast Mastered": ["Noise Reduction", "Normalize", "Compress Dynamic Range"],
"Radio Ready": ["Bass Boost", "Treble Boost", "Limiter"],
"Music Production": ["Reverb", "Stereo Widening", "Pitch Shift"],
"ASMR Creator": ["Noise Gate", "Auto Gain", "Low-Pass Filter"],
"Voiceover Pro": ["Vocal Isolation", "TTS", "EQ Match"],
"8-bit Retro": ["Bitcrusher", "Echo", "Mono Downmix"]
}
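# Each JSON file in presets/ is expected to look like
#   {"name": "Clean Podcast", "effects": ["Noise Reduction", "Normalize"]}
# (the two keys load_presets() checks); files without both keys are ignored.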
preset_names = list(preset_choices.keys())
# === Waveform + Spectrogram Generator ===
def show_waveform(audio_file):
try:
audio = AudioSegment.from_file(audio_file)
samples = np.array(audio.get_array_of_samples())
plt.figure(figsize=(10, 2))
plt.plot(samples[:10000], color="blue")
plt.axis("off")
buf = BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight", dpi=100)
plt.close()
buf.seek(0)
return Image.open(buf)
except Exception as e:
return None
def detect_genre(audio_path):
    # placeholder classifier: MFCC features are extracted but no model consumes
    # them yet, so every readable clip is labeled "Speech"
    try:
        y, sr = torchaudio.load(audio_path)
        librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13)
        return "Speech"
    except Exception:
        return "Unknown"
# === Session Info Export ===
def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
log = {
"timestamp": str(datetime.datetime.now()),
"filename": os.path.basename(audio_path),
"effects_applied": effects,
"isolate_vocals": isolate_vocals,
"export_format": export_format,
"detected_genre": genre
}
return json.dumps(log, indent=2)
# === Main Processing Function with Status Updates ===
def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading audio..."
    try:
        audio = AudioSegment.from_file(audio_file)
        status = "🛠 Applying effects..."
        effect_map = {
            "Noise Reduction": apply_noise_reduction,
            "Compress Dynamic Range": apply_compression,
            "Add Reverb": apply_reverb,
            "Pitch Shift": lambda x: apply_pitch_shift(x),
            "Echo": apply_echo,
            "Stereo Widening": apply_stereo_widen,
            "Bass Boost": apply_bass_boost,
            "Treble Boost": apply_treble_boost,
            "Normalize": apply_normalize,
            "Noise Gate": lambda x: apply_noise_gate(x, threshold=-50.0),
            "Limiter": lambda x: apply_limiter(x, limit_dB=-1),
            "Phaser": lambda x: apply_phaser(x),
            "Flanger": lambda x: apply_phaser(x, rate=1.2, depth=0.9, mix=0.7),
            "Bitcrusher": lambda x: apply_bitcrush(x, bit_depth=8),
            "Auto Gain": lambda x: apply_auto_gain(x, target_dB=-20),
            "Vocal Distortion": lambda x: apply_vocal_distortion(x)
        }
        # an empty or unknown preset falls back to the manually selected effects
        effects_to_apply = preset_choices.get(preset_name) or selected_effects
        for effect_name in effects_to_apply:
            if effect_name in effect_map:
                audio = effect_map[effect_name](audio)
        status = "💾 Saving final audio..."
        # give the temp file an extension that matches the chosen export format
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{export_format.lower()}") as f:
            if isolate_vocals:
                temp_input = os.path.join(tempfile.gettempdir(), "input.wav")
                audio.export(temp_input, format="wav")
                vocal_path = apply_vocal_isolation(temp_input)
                final_audio = AudioSegment.from_wav(vocal_path)
            else:
                final_audio = audio
            output_path = f.name
            final_audio.export(output_path, format=export_format.lower())
        waveform_image = show_waveform(output_path)
        genre = detect_genre(output_path)
        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
        status = "🎉 Done!"
        return output_path, waveform_image, session_log, genre, status
    except Exception as e:
        status = f"❌ Error: {str(e)}"
        return None, None, status, "", status
# === Batch Processing Function ===
def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading files..."
    try:
        output_dir = tempfile.mkdtemp()
        results = []
        session_logs = []
        for file in files:
            # gr.File may hand back either a tempfile wrapper or a plain path string
            path = file.name if hasattr(file, "name") else file
            processed_path, _, log, _, _ = process_audio(path, selected_effects, isolate_vocals, preset_name, export_format)
            if processed_path is None:
                continue  # skip files that failed to process
            results.append(processed_path)
            session_logs.append(log)
        zip_path = os.path.join(output_dir, "batch_output.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for i, res in enumerate(results):
                filename = f"processed_{i}.{export_format.lower()}"
                zipf.write(res, filename)
                zipf.writestr(f"session_info_{i}.json", session_logs[i])
        return zip_path, "📦 ZIP created successfully!"
    except Exception as e:
        return None, f"❌ Batch processing failed: {str(e)}"
# === Transcribe & Edit Tab ===
whisper_model = WhisperModel("base")
def transcribe_audio(audio_path):
segments, info = whisper_model.transcribe(audio_path, beam_size=5)
text = " ".join([seg.text for seg in segments])
return text
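# Note: faster-whisper returns segments as a lazy generator; the join above is
# what actually iterates it and runs the transcription.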
# === TTS Tab ===
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
def generate_tts(text):
out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
tts.tts_to_file(text=text, file_path=out_path)
return out_path
# === Save/Load Project File (.aiproj) ===
def save_project(audio_path, preset_name, effects):
project_data = {
"audio": AudioSegment.from_file(audio_path).raw_data,
"preset": preset_name,
"effects": effects
}
out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
with open(out_path, "wb") as f:
pickle.dump(project_data, f)
return out_path
def load_project(project_file):
with open(project_file.name, "rb") as f:
data = pickle.load(f)
return data["preset"], data["effects"]
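# Caution: .aiproj files are plain pickles. Only load project files you created
# yourself, since unpickling untrusted data can execute arbitrary code.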
# === Trim Silence Automatically (VAD) ===
def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
    path = audio_file.name if hasattr(audio_file, "name") else audio_file
    audio = AudioSegment.from_file(path)
    nonsilent_ranges = detect_nonsilent(
        audio,
        min_silence_len=int(min_silence_len),
        silence_thresh=silence_threshold
    )
    out_path = os.path.join(tempfile.gettempdir(), "trimmed.wav")
    if not nonsilent_ranges:
        # nothing but silence detected; return the file unchanged
        audio.export(out_path, format="wav")
        return out_path
    # keep everything from the first non-silent region to the last one
    trimmed = audio[nonsilent_ranges[0][0]:nonsilent_ranges[-1][1]]
    trimmed.export(out_path, format="wav")
    return out_path
# === Mix Two Tracks ===
def mix_tracks(track1, track2, volume_offset=0):
a1 = AudioSegment.from_file(track1)
a2 = AudioSegment.from_file(track2)
mixed = a1.overlay(a2 - volume_offset)
out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
mixed.export(out_path, format="wav")
return out_path
# === Dummy Voice Cloning Tab – Works on Local Only ===
def clone_voice(*args):
return "⚠️ Voice cloning requires local install – use Python 3.9 or below"
# === Speaker Diarization ("Who Spoke When?") ===
try:
from pyannote.audio import Pipeline as DiarizationPipeline
from huggingface_hub import login
hf_token = os.getenv("HF_TOKEN")
if hf_token:
login(token=hf_token)
diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
except Exception as e:
diarize_pipeline = None
print(f"⚠️ Failed to load diarization: {e}")
def diarize_and_transcribe(audio_path):
    if not diarize_pipeline:
        return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
    # Re-encode to WAV so both pyannote and faster-whisper can read the input
    audio = AudioSegment.from_file(audio_path)
    temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
    audio.export(temp_wav, format="wav")
    try:
        diarization = diarize_pipeline(temp_wav)
        # faster-whisper yields segments lazily; materialize them once up front
        whisper_segments, _ = whisper_model.transcribe(temp_wav, beam_size=5)
        whisper_segments = [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in whisper_segments
        ]
        segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            text = " ".join(
                seg["text"] for seg in whisper_segments
                if seg["start"] >= turn.start and seg["end"] <= turn.end
            )
            segments.append({
                "speaker": speaker,
                "start": turn.start,
                "end": turn.end,
                "text": text
            })
        return segments
    except Exception as e:
        return f"⚠️ Diarization failed: {str(e)}"
# === UI ===
effect_options = [
"Noise Reduction",
"Compress Dynamic Range",
"Add Reverb",
"Pitch Shift",
"Echo",
"Stereo Widening",
"Bass Boost",
"Treble Boost",
"Normalize",
"Noise Gate",
"Limiter",
"Phaser",
"Flanger",
"Bitcrusher",
"Auto Gain",
"Vocal Distortion"
]
with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
gr.Markdown("## 🎧 Ultimate AI Audio Studio\nUpload, edit, export β€” powered by AI!")
# --- Single File Studio ---
with gr.Tab("🎡 Single File Studio"):
gr.Interface(
fn=process_audio,
inputs=[
gr.Audio(label="Upload Audio", type="filepath"),
gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
gr.Checkbox(label="Isolate Vocals After Effects"),
gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
],
outputs=[
gr.Audio(label="Processed Audio", type="filepath"),
gr.Image(label="Waveform Preview"),
gr.Textbox(label="Session Log (JSON)", lines=5),
gr.Textbox(label="Detected Genre", lines=1),
gr.Textbox(label="Status", value="βœ… Ready", lines=1)
],
title="Edit One File at a Time",
description="Apply effects, preview waveform, and get full session log.",
flagging_mode="never",
submit_btn="Process Audio",
clear_btn=None
)
# --- Batch Processing ---
with gr.Tab("πŸ”Š Batch Processing"):
gr.Interface(
fn=batch_process_audio,
inputs=[
gr.File(label="Upload Multiple Files", file_count="multiple"),
gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
gr.Checkbox(label="Isolate Vocals After Effects"),
gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0]),
gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
],
outputs=[
gr.File(label="Download ZIP of All Processed Files"),
gr.Textbox(label="Status", value="βœ… Ready", lines=1)
],
title="Batch Audio Processor",
description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
flagging_mode="never",
submit_btn="Process All Files",
clear_btn=None
)
# --- Remix Mode ---
with gr.Tab("πŸŽ› Remix Mode"):
gr.Interface(
fn=stem_split,
inputs=gr.Audio(label="Upload Music Track", type="filepath"),
            outputs=[
                gr.File(label="Drums"),
                gr.File(label="Bass"),
                gr.File(label="Other"),
                gr.File(label="Vocals")
            ],
title="Split Into Drums, Bass, Vocals, and More",
description="Use AI to separate musical elements like vocals, drums, and bass.",
flagging_mode="never",
clear_btn=None
)
    # --- Transcribe & Edit ---
    with gr.Tab("📝 Transcribe & Edit"):
gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(label="Upload Audio", type="filepath"),
outputs=gr.Textbox(label="Transcribed Text", lines=10),
title="Transcribe & Edit Spoken Content",
description="Convert voice to text and edit it before exporting again."
)
    # --- Voice Cloning (Local Only) ---
    with gr.Tab("🎭 Voice Cloning (Local Only)"):
gr.Interface(
fn=clone_voice,
inputs=[
gr.File(label="Source Voice Clip"),
gr.File(label="Target Voice Clip"),
gr.Textbox(label="Text to Clone", lines=5)
],
outputs=gr.Audio(label="Cloned Output", type="filepath"),
title="Replace One Voice With Another",
description="Clone voice from source to target speaker using AI"
)
    # --- Speaker Diarization (Who Spoke When?) ---
    if diarize_pipeline:
        with gr.Tab("🧍‍♂️ Who Spoke When?"):
gr.Interface(
fn=diarize_and_transcribe,
inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
outputs=gr.JSON(label="Diarized Transcript"),
title="Split By Speaker + Transcribe",
description="Detect speakers and transcribe their speech automatically."
)
    # --- TTS Voice Generator ---
    with gr.Tab("💬 TTS Voice Generator"):
gr.Interface(
fn=generate_tts,
inputs=gr.Textbox(label="Enter Text", lines=5),
outputs=gr.Audio(label="Generated Speech", type="filepath"),
title="Text-to-Speech Generator",
description="Type anything and turn it into natural-sounding speech."
)
    # --- Auto-Save / Resume Sessions ---
session_state = gr.State()
    def save_or_resume_session(audio, preset, effects, session, action="save"):
        if action == "save":
            # stash the current UI state in the session State component
            return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
        elif action == "load" and isinstance(session, dict):
            # restore audio, preset, and effects from the saved session
            return session, session.get("audio"), session.get("preset"), session.get("effects")
        return None, None, None, None
with gr.Tab("🧾 Auto-Save & Resume"):
gr.Markdown("Save your current state and resume editing later.")
action_radio = gr.Radio(["save", "load"], label="Action", value="save")
audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
action_btn = gr.Button("Save or Load Session")
session_data = gr.State()
loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
        action_btn.click(
            fn=save_or_resume_session,
            inputs=[audio_input, preset_dropdown, effect_checkbox, session_data, action_radio],
            outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
        )
    # --- VAD – Detect & Remove Silence ---
    with gr.Tab("✂️ Trim Silence Automatically"):
gr.Interface(
fn=detect_silence,
inputs=[
gr.File(label="Upload Track"),
gr.Slider(minimum=-100, maximum=-10, value=-50, label="Silence Threshold (dB)"),
gr.Number(label="Min Silence Length (ms)", value=1000)
],
outputs=gr.File(label="Trimmed Output"),
title="Auto-Detect & Remove Silence",
description="Detect and trim silence at start/end or between words"
)
    # --- Save/Load Project File (.aiproj) ---
    with gr.Tab("📁 Save/Load Project"):
gr.Interface(
fn=save_project,
inputs=[
gr.File(label="Original Audio"),
gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
],
outputs=gr.File(label="Project File (.aiproj)"),
title="Save Everything Together",
description="Save your session, effects, and settings in one file to reuse later."
)
gr.Interface(
fn=load_project,
inputs=gr.File(label="Upload .aiproj File"),
outputs=[
gr.Dropdown(choices=preset_names, label="Loaded Preset"),
gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
],
title="Resume Last Project",
description="Load your saved session"
)
    # --- Mix Two Tracks ---
    with gr.Tab("🔀 Mix Two Tracks"):
gr.Interface(
fn=mix_tracks,
inputs=[
gr.File(label="Main Track"),
gr.File(label="Background Track"),
gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
],
outputs=gr.File(label="Mixed Output"),
title="Overlay Two Tracks",
description="Mix, blend, or subtract two audio files."
)
demo.launch()