# AudioMaster / app.py
import gradio as gr
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import numpy as np
import tempfile
import os
import noisereduce as nr
import json
import torch
from demucs import pretrained
from demucs.apply import apply_model
import torchaudio
from pathlib import Path
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import zipfile
import datetime
import librosa
import warnings
from faster_whisper import WhisperModel
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
from TTS.api import TTS
import pickle
# Suppress warnings
warnings.filterwarnings("ignore")
# === Helper Functions ===
def audiosegment_to_array(audio):
return np.array(audio.get_array_of_samples()), audio.frame_rate
def array_to_audiosegment(samples, frame_rate, channels=1):
return AudioSegment(
samples.tobytes(),
frame_rate=frame_rate,
sample_width=samples.dtype.itemsize,
channels=channels
)
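# The two helpers above round-trip between pydub and NumPy: get_array_of_samples()
# yields interleaved samples (L/R/L/R... for stereo), so channel count and sample
# width must be preserved when rebuilding the segment. Illustrative sketch:
#   samples, sr = audiosegment_to_array(seg)
#   restored = array_to_audiosegment(samples.astype(np.int16), sr, channels=seg.channels)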
# === Effect Functions ===
def apply_normalize(audio):
return audio.normalize()
def apply_noise_reduction(audio):
    samples, frame_rate = audiosegment_to_array(audio)
    reduced = nr.reduce_noise(y=samples.astype(np.float32), sr=frame_rate)
    # noisereduce returns floats; cast back to int16 so the sample width stays 2 bytes
    return array_to_audiosegment(reduced.astype(np.int16), frame_rate, channels=audio.channels)
def apply_compression(audio):
return audio.compress_dynamic_range()
def apply_reverb(audio):
    # crude reverb approximation: overlay a quieter copy of the signal delayed by 1 s
    reverb = audio - 10
    return audio.overlay(reverb, position=1000)
def apply_pitch_shift(audio, semitones=-2):
    # naive resampling pitch shift: stretch the sample array by the pitch ratio and
    # keep the original frame rate (duration changes along with pitch)
    ratio = 2 ** (semitones / 12)
    samples = np.array(audio.get_array_of_samples())
    resampled = np.interp(
        np.arange(0, len(samples), ratio),
        np.arange(len(samples)),
        samples
    ).astype(np.int16)
    return AudioSegment(
        resampled.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=2,  # data was cast to int16 above
        channels=audio.channels
    )
def apply_echo(audio, delay_ms=500, decay=0.5):
    # single echo tap: attenuate the copy according to the decay factor, then delay it
    echo = audio.apply_gain(20 * np.log10(max(decay, 1e-3)))
    return audio.overlay(echo, position=delay_ms)
def apply_stereo_widen(audio, pan_amount=0.3):
    # Haas-style widening: split into mono channels and delay one side slightly
    # (pan() returns stereo segments, which from_mono_audiosegments rejects)
    stereo = audio.set_channels(2)
    left, right = stereo.split_to_mono()
    delay_ms = int(pan_amount * 30)  # up to ~30 ms inter-channel delay
    right = AudioSegment.silent(duration=delay_ms, frame_rate=right.frame_rate) + right
    left = left + AudioSegment.silent(duration=delay_ms, frame_rate=left.frame_rate)
    return AudioSegment.from_mono_audiosegments(left, right)
def apply_bass_boost(audio, gain=10):
    # boost the lows by overlaying a boosted low-passed copy on the original signal
    low = audio.low_pass_filter(100).apply_gain(gain)
    return audio.overlay(low)
def apply_treble_boost(audio, gain=10):
    # boost the highs by overlaying a boosted high-passed copy on the original signal
    high = audio.high_pass_filter(4000).apply_gain(gain)
    return audio.overlay(high)
def apply_noise_gate(audio, threshold=-50.0, attack=50, release=100):
    # zero out samples whose level falls below the threshold (given in dBFS);
    # attack/release are reserved for future envelope smoothing
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    peak = np.max(np.abs(samples))
    if peak < 1:
        return audio
    normalized = samples / peak
    envelope = np.abs(normalized)
    thresh_linear = 10 ** (threshold / 20)  # e.g. -50 dB -> ~0.003
    gated = np.where(envelope > thresh_linear, normalized, 0)
    return array_to_audiosegment((gated * np.iinfo(np.int16).max).astype(np.int16),
                                 audio.frame_rate, channels=audio.channels)
def apply_limiter(audio, limit_dB=-1):
    # simple peak limiter: if the peak exceeds the ceiling, pull the whole segment
    # down so the peak sits at limit_dB
    headroom = limit_dB - audio.max_dBFS
    return audio.apply_gain(min(headroom, 0))
def apply_phaser(audio, rate=0.5, depth=0.7, feedback=0.2, mix=0.5):
    # placeholder "phaser"/"flanger": approximated by scaling the frame rate;
    # depth, feedback, and mix are currently unused
    return audio._spawn(audio.raw_data, overrides={"frame_rate": int(audio.frame_rate * rate)})
def apply_bitcrush(audio, bit_depth=8):
    # quantize to the target bit depth, then rescale so the output keeps its loudness
    samples = np.array(audio.get_array_of_samples())
    step = max(np.iinfo(np.int16).max // (2 ** bit_depth), 1)
    crushed = ((samples // step) * step).astype(np.int16)
    return array_to_audiosegment(crushed, audio.frame_rate, channels=audio.channels)
def apply_auto_gain(audio, target_dB=-20):
change = target_dB - audio.dBFS
return audio.apply_gain(change)
def apply_vocal_distortion(audio, intensity=0.3):
    # waveshaping distortion: add a scaled sine of the signal, then clip to int16 range
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    max_val = np.iinfo(np.int16).max
    distorted = samples + intensity * max_val * np.sin(samples * 2 * np.pi / 32768)
    distorted = np.clip(distorted, -max_val - 1, max_val)
    return array_to_audiosegment(distorted.astype(np.int16), audio.frame_rate, channels=audio.channels)
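# All of the effect functions above take an AudioSegment and return a new
# AudioSegment, so process_audio() below can chain them in any user-selected order.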
# === Vocal Isolation Helpers ===
def load_track_local(path, sample_rate, channels=2):
    sig, rate = torchaudio.load(path)
    if rate != sample_rate:
        sig = torchaudio.functional.resample(sig, rate, sample_rate)
    if channels == 1:
        sig = sig.mean(0, keepdim=True)
    elif sig.shape[0] == 1 and channels == 2:
        # Demucs expects stereo input; duplicate a mono channel
        sig = sig.repeat(2, 1)
    return sig
def save_track(path, wav, sample_rate):
path = Path(path)
torchaudio.save(str(path), wav, sample_rate)
def apply_vocal_isolation(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    # normalize the mix the way the demucs CLI does, then undo it on the output
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / (ref.std() + 1e-8)
    sources = apply_model(model, wav[None])[0]
    sources = sources * ref.std() + ref.mean()
    # htdemucs source order is ['drums', 'bass', 'other', 'vocals']
    vocal_track = sources[model.sources.index('vocals')].cpu()
    out_path = os.path.join(tempfile.gettempdir(), "vocals.wav")
    save_track(out_path, vocal_track, model.samplerate)
    return out_path
# === Stem Splitting (Drums, Bass, Other, Vocals) ===
def stem_split(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    sources = apply_model(model, wav[None])[0]
    output_dir = tempfile.mkdtemp()
    stem_paths = []
    # model.sources for htdemucs is ['drums', 'bass', 'other', 'vocals']
    for i, name in enumerate(model.sources):
        path = os.path.join(output_dir, f"{name}.wav")
        save_track(path, sources[i].cpu(), model.samplerate)
        stem_paths.append(path)
    return stem_paths
# === Preset Loader with Fallback ===
def load_presets():
try:
preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
presets = {}
for f in preset_files:
path = os.path.join("presets", f)
try:
with open(path, "r") as infile:
data = json.load(infile)
if "name" in data and "effects" in data:
presets[data["name"]] = data["effects"]
except json.JSONDecodeError:
print(f"Invalid JSON: {f}")
return presets
except FileNotFoundError:
print("Presets folder not found")
return {}
preset_choices = load_presets()
if not preset_choices:
preset_choices = {
"Default": [],
"Clean Podcast": ["Noise Reduction", "Normalize"],
"Podcast Mastered": ["Noise Reduction", "Normalize", "Compress Dynamic Range"],
"Radio Ready": ["Bass Boost", "Treble Boost", "Limiter"],
"Music Production": ["Reverb", "Stereo Widening", "Pitch Shift"],
"ASMR Creator": ["Noise Gate", "Auto Gain", "Low-Pass Filter"],
"Voiceover Pro": ["Vocal Isolation", "TTS", "EQ Match"],
"8-bit Retro": ["Bitcrusher", "Echo", "Mono Downmix"]
}
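# Each JSON file in presets/ is expected to look like
#   {"name": "Clean Podcast", "effects": ["Noise Reduction", "Normalize"]}
# (the two keys load_presets() checks); files without both keys are ignored.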
preset_names = list(preset_choices.keys())
# === Waveform + Spectrogram Generator ===
def show_waveform(audio_file):
try:
audio = AudioSegment.from_file(audio_file)
samples = np.array(audio.get_array_of_samples())
plt.figure(figsize=(10, 2))
plt.plot(samples[:10000], color="blue")
plt.axis("off")
buf = BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight", dpi=100)
plt.close()
buf.seek(0)
return Image.open(buf)
except Exception as e:
return None
def detect_genre(audio_path):
    # placeholder classifier: MFCC features are extracted but no model consumes
    # them yet, so every readable clip is labeled "Speech"
    try:
        y, sr = torchaudio.load(audio_path)
        librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13)
        return "Speech"
    except Exception:
        return "Unknown"
# === Session Info Export ===
def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
log = {
"timestamp": str(datetime.datetime.now()),
"filename": os.path.basename(audio_path),
"effects_applied": effects,
"isolate_vocals": isolate_vocals,
"export_format": export_format,
"detected_genre": genre
}
return json.dumps(log, indent=2)
# === Main Processing Function with Status Updates ===
def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading audio..."
    try:
        audio = AudioSegment.from_file(audio_file)
        status = "🛠 Applying effects..."
        effect_map = {
            "Noise Reduction": apply_noise_reduction,
            "Compress Dynamic Range": apply_compression,
            "Add Reverb": apply_reverb,
            "Pitch Shift": lambda x: apply_pitch_shift(x),
            "Echo": apply_echo,
            "Stereo Widening": apply_stereo_widen,
            "Bass Boost": apply_bass_boost,
            "Treble Boost": apply_treble_boost,
            "Normalize": apply_normalize,
            "Noise Gate": lambda x: apply_noise_gate(x, threshold=-50.0),
            "Limiter": lambda x: apply_limiter(x, limit_dB=-1),
            "Phaser": lambda x: apply_phaser(x),
            "Flanger": lambda x: apply_phaser(x, rate=1.2, depth=0.9, mix=0.7),
            "Bitcrusher": lambda x: apply_bitcrush(x, bit_depth=8),
            "Auto Gain": lambda x: apply_auto_gain(x, target_dB=-20),
            "Vocal Distortion": lambda x: apply_vocal_distortion(x)
        }
        # an empty or unknown preset falls back to the manually selected effects
        effects_to_apply = preset_choices.get(preset_name) or selected_effects
        for effect_name in effects_to_apply:
            if effect_name in effect_map:
                audio = effect_map[effect_name](audio)
        status = "💾 Saving final audio..."
        # give the temp file an extension that matches the chosen export format
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{export_format.lower()}") as f:
            if isolate_vocals:
                temp_input = os.path.join(tempfile.gettempdir(), "input.wav")
                audio.export(temp_input, format="wav")
                vocal_path = apply_vocal_isolation(temp_input)
                final_audio = AudioSegment.from_wav(vocal_path)
            else:
                final_audio = audio
            output_path = f.name
            final_audio.export(output_path, format=export_format.lower())
        waveform_image = show_waveform(output_path)
        genre = detect_genre(output_path)
        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
        status = "🎉 Done!"
        return output_path, waveform_image, session_log, genre, status
    except Exception as e:
        status = f"❌ Error: {str(e)}"
        return None, None, status, "", status
# === Batch Processing Function ===
def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading files..."
    try:
        output_dir = tempfile.mkdtemp()
        results = []
        session_logs = []
        for file in files:
            # gr.File may hand back either a tempfile wrapper or a plain path string
            path = file.name if hasattr(file, "name") else file
            processed_path, _, log, _, _ = process_audio(path, selected_effects, isolate_vocals, preset_name, export_format)
            if processed_path is None:
                continue  # skip files that failed to process
            results.append(processed_path)
            session_logs.append(log)
        zip_path = os.path.join(output_dir, "batch_output.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for i, res in enumerate(results):
                filename = f"processed_{i}.{export_format.lower()}"
                zipf.write(res, filename)
                zipf.writestr(f"session_info_{i}.json", session_logs[i])
        return zip_path, "📦 ZIP created successfully!"
    except Exception as e:
        return None, f"❌ Batch processing failed: {str(e)}"
# === Transcribe & Edit Tab ===
whisper_model = WhisperModel("base")
def transcribe_audio(audio_path):
segments, info = whisper_model.transcribe(audio_path, beam_size=5)
text = " ".join([seg.text for seg in segments])
return text
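# Note: faster-whisper returns segments as a lazy generator; the join above is
# what actually iterates it and runs the transcription.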
# === TTS Tab ===
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
def generate_tts(text):
out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
tts.tts_to_file(text=text, file_path=out_path)
return out_path
# === Save/Load Project File (.aiproj) ===
def save_project(audio_path, preset_name, effects):
project_data = {
"audio": AudioSegment.from_file(audio_path).raw_data,
"preset": preset_name,
"effects": effects
}
out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
with open(out_path, "wb") as f:
pickle.dump(project_data, f)
return out_path
def load_project(project_file):
with open(project_file.name, "rb") as f:
data = pickle.load(f)
return data["preset"], data["effects"]
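# Caution: .aiproj files are plain pickles. Only load project files you created
# yourself, since unpickling untrusted data can execute arbitrary code.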
# === Trim Silence Automatically (VAD) ===
def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
    path = audio_file.name if hasattr(audio_file, "name") else audio_file
    audio = AudioSegment.from_file(path)
    nonsilent_ranges = detect_nonsilent(
        audio,
        min_silence_len=int(min_silence_len),
        silence_thresh=silence_threshold
    )
    out_path = os.path.join(tempfile.gettempdir(), "trimmed.wav")
    if not nonsilent_ranges:
        # nothing but silence detected; return the file unchanged
        audio.export(out_path, format="wav")
        return out_path
    # keep everything from the first non-silent region to the last one
    trimmed = audio[nonsilent_ranges[0][0]:nonsilent_ranges[-1][1]]
    trimmed.export(out_path, format="wav")
    return out_path
# === Mix Two Tracks ===
def mix_tracks(track1, track2, volume_offset=0):
a1 = AudioSegment.from_file(track1)
a2 = AudioSegment.from_file(track2)
mixed = a1.overlay(a2 - volume_offset)
out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
mixed.export(out_path, format="wav")
return out_path
# === Dummy Voice Cloning Tab – Works on Local Only ===
def clone_voice(*args):
return "⚠️ Voice cloning requires local install – use Python 3.9 or below"
# === Speaker Diarization ("Who Spoke When?") ===
try:
from pyannote.audio import Pipeline as DiarizationPipeline
from huggingface_hub import login
hf_token = os.getenv("HF_TOKEN")
if hf_token:
login(token=hf_token)
diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
except Exception as e:
diarize_pipeline = None
print(f"⚠️ Failed to load diarization: {e}")
def diarize_and_transcribe(audio_path):
    if not diarize_pipeline:
        return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
    # Re-encode to WAV so both pyannote and faster-whisper can read the input
    audio = AudioSegment.from_file(audio_path)
    temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
    audio.export(temp_wav, format="wav")
    try:
        diarization = diarize_pipeline(temp_wav)
        # faster-whisper yields segments lazily; materialize them once up front
        whisper_segments, _ = whisper_model.transcribe(temp_wav, beam_size=5)
        whisper_segments = [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in whisper_segments
        ]
        segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            text = " ".join(
                seg["text"] for seg in whisper_segments
                if seg["start"] >= turn.start and seg["end"] <= turn.end
            )
            segments.append({
                "speaker": speaker,
                "start": turn.start,
                "end": turn.end,
                "text": text
            })
        return segments
    except Exception as e:
        return f"⚠️ Diarization failed: {str(e)}"
# === UI ===
effect_options = [
"Noise Reduction",
"Compress Dynamic Range",
"Add Reverb",
"Pitch Shift",
"Echo",
"Stereo Widening",
"Bass Boost",
"Treble Boost",
"Normalize",
"Noise Gate",
"Limiter",
"Phaser",
"Flanger",
"Bitcrusher",
"Auto Gain",
"Vocal Distortion"
]
with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
gr.Markdown("## 🎧 Ultimate AI Audio Studio\nUpload, edit, export β€” powered by AI!")
# --- Single File Studio ---
with gr.Tab("🎡 Single File Studio"):
gr.Interface(
fn=process_audio,
inputs=[
gr.Audio(label="Upload Audio", type="filepath"),
gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
gr.Checkbox(label="Isolate Vocals After Effects"),
gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
],
outputs=[
gr.Audio(label="Processed Audio", type="filepath"),
gr.Image(label="Waveform Preview"),
gr.Textbox(label="Session Log (JSON)", lines=5),
gr.Textbox(label="Detected Genre", lines=1),
gr.Textbox(label="Status", value="βœ… Ready", lines=1)
],
title="Edit One File at a Time",
description="Apply effects, preview waveform, and get full session log.",
flagging_mode="never",
submit_btn="Process Audio",
clear_btn=None
)
# --- Batch Processing ---
with gr.Tab("πŸ”Š Batch Processing"):
gr.Interface(
fn=batch_process_audio,
inputs=[
gr.File(label="Upload Multiple Files", file_count="multiple"),
gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
gr.Checkbox(label="Isolate Vocals After Effects"),
gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0]),
gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
],
outputs=[
gr.File(label="Download ZIP of All Processed Files"),
gr.Textbox(label="Status", value="βœ… Ready", lines=1)
],
title="Batch Audio Processor",
description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
flagging_mode="never",
submit_btn="Process All Files",
clear_btn=None
)
# --- Remix Mode ---
with gr.Tab("πŸŽ› Remix Mode"):
gr.Interface(
fn=stem_split,
inputs=gr.Audio(label="Upload Music Track", type="filepath"),
            outputs=[
                gr.File(label="Drums"),
                gr.File(label="Bass"),
                gr.File(label="Other"),
                gr.File(label="Vocals")
            ],
title="Split Into Drums, Bass, Vocals, and More",
description="Use AI to separate musical elements like vocals, drums, and bass.",
flagging_mode="never",
clear_btn=None
)
    # --- Transcribe & Edit ---
    with gr.Tab("📝 Transcribe & Edit"):
gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(label="Upload Audio", type="filepath"),
outputs=gr.Textbox(label="Transcribed Text", lines=10),
title="Transcribe & Edit Spoken Content",
description="Convert voice to text and edit it before exporting again."
)
    # --- Voice Cloning (Local Only) ---
    with gr.Tab("🎭 Voice Cloning (Local Only)"):
gr.Interface(
fn=clone_voice,
inputs=[
gr.File(label="Source Voice Clip"),
gr.File(label="Target Voice Clip"),
gr.Textbox(label="Text to Clone", lines=5)
],
outputs=gr.Audio(label="Cloned Output", type="filepath"),
title="Replace One Voice With Another",
description="Clone voice from source to target speaker using AI"
)
    # --- Speaker Diarization (Who Spoke When?) ---
    if diarize_pipeline:
        with gr.Tab("🧍‍♂️ Who Spoke When?"):
gr.Interface(
fn=diarize_and_transcribe,
inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
outputs=gr.JSON(label="Diarized Transcript"),
title="Split By Speaker + Transcribe",
description="Detect speakers and transcribe their speech automatically."
)
    # --- TTS Voice Generator ---
    with gr.Tab("💬 TTS Voice Generator"):
gr.Interface(
fn=generate_tts,
inputs=gr.Textbox(label="Enter Text", lines=5),
outputs=gr.Audio(label="Generated Speech", type="filepath"),
title="Text-to-Speech Generator",
description="Type anything and turn it into natural-sounding speech."
)
    # --- Auto-Save / Resume Sessions ---
session_state = gr.State()
    def save_or_resume_session(audio, preset, effects, session, action="save"):
        if action == "save":
            # stash the current UI state in the session State component
            return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
        elif action == "load" and isinstance(session, dict):
            # restore audio, preset, and effects from the saved session
            return session, session.get("audio"), session.get("preset"), session.get("effects")
        return None, None, None, None
with gr.Tab("🧾 Auto-Save & Resume"):
gr.Markdown("Save your current state and resume editing later.")
action_radio = gr.Radio(["save", "load"], label="Action", value="save")
audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0] if preset_names else None)
effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
action_btn = gr.Button("Save or Load Session")
session_data = gr.State()
loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
        action_btn.click(
            fn=save_or_resume_session,
            inputs=[audio_input, preset_dropdown, effect_checkbox, session_data, action_radio],
            outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
        )
    # --- VAD – Detect & Remove Silence ---
    with gr.Tab("✂️ Trim Silence Automatically"):
gr.Interface(
fn=detect_silence,
inputs=[
gr.File(label="Upload Track"),
gr.Slider(minimum=-100, maximum=-10, value=-50, label="Silence Threshold (dB)"),
gr.Number(label="Min Silence Length (ms)", value=1000)
],
outputs=gr.File(label="Trimmed Output"),
title="Auto-Detect & Remove Silence",
description="Detect and trim silence at start/end or between words"
)
    # --- Save/Load Project File (.aiproj) ---
    with gr.Tab("📁 Save/Load Project"):
gr.Interface(
fn=save_project,
inputs=[
gr.File(label="Original Audio"),
gr.Dropdown(choices=preset_names, label="Used Preset", value=preset_names[0]),
gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
],
outputs=gr.File(label="Project File (.aiproj)"),
title="Save Everything Together",
description="Save your session, effects, and settings in one file to reuse later."
)
gr.Interface(
fn=load_project,
inputs=gr.File(label="Upload .aiproj File"),
outputs=[
gr.Dropdown(choices=preset_names, label="Loaded Preset"),
gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
],
title="Resume Last Project",
description="Load your saved session"
)
    # --- Mix Two Tracks ---
    with gr.Tab("🔀 Mix Two Tracks"):
gr.Interface(
fn=mix_tracks,
inputs=[
gr.File(label="Main Track"),
gr.File(label="Background Track"),
gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
],
outputs=gr.File(label="Mixed Output"),
title="Overlay Two Tracks",
description="Mix, blend, or subtract two audio files."
)
demo.launch()