Spaces:

tee342
/

AudioMaster

Sleeping

App Files Files Community

AudioMaster / app.py

tee342

Update app.py

c260091 verified 5 months ago

raw

history blame

16.3 kB

	import gradio as gr
	from pydub import AudioSegment
	import numpy as np
	import tempfile
	import os
	import noisereduce as nr
	import json
	import torch
	from demucs import pretrained
	from demucs.apply import apply_model
	import torchaudio
	from pathlib import Path
	import matplotlib.pyplot as plt
	from io import BytesIO
	from PIL import Image
	import zipfile
	import datetime
	import librosa
	import joblib
	import warnings
	from faster_whisper import WhisperModel
	from mutagen.mp3 import MP3
	from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
	import whisper
	from pyannote.audio import Pipeline as DiarizationPipeline
	from openvoice.api import TTS, ToneColorConverter
	from openvoice.se_extractor import get_se

	# Suppress warnings
	# Installs a blanket "ignore" filter: every warning issued after this line,
	# whatever its category or originating module, is silenced.
	warnings.filterwarnings("ignore")

	# === Helper Functions ===
	def audiosegment_to_array(audio):
	    """Return ``(samples, frame_rate)`` for *audio*.

	    ``samples`` is a NumPy array built from ``audio.get_array_of_samples()``
	    and ``frame_rate`` is read straight from ``audio.frame_rate``.
	    """
	    raw_samples = audio.get_array_of_samples()
	    sample_array = np.array(raw_samples)
	    return sample_array, audio.frame_rate

	def array_to_audiosegment(samples, frame_rate, channels=1):
	    """Build an ``AudioSegment`` from a NumPy sample array.

	    The raw bytes of *samples* become the audio data and the sample width is
	    taken from the array's dtype item size.
	    """
	    segment_options = {
	        "frame_rate": frame_rate,
	        "sample_width": samples.dtype.itemsize,
	        "channels": channels,
	    }
	    return AudioSegment(samples.tobytes(), **segment_options)

	# === Effect Functions ===
	def apply_normalize(audio):
	    """Return the result of calling ``normalize()`` on *audio*."""
	    normalized = audio.normalize()
	    return normalized

	def apply_noise_reduction(audio):
	    """Run noisereduce over *audio* and return a new segment.

	    The sample array of a multi-channel segment is interleaved
	    (L, R, L, R, ...). Handing that to ``reduce_noise`` as a single 1-D signal
	    would treat alternating channels as one waveform, so the samples are
	    de-interleaved into ``(channels, frames)`` first and re-interleaved after.

	    Returns *audio* unchanged when it contains no samples.
	    """
	    samples, frame_rate = audiosegment_to_array(audio)
	    if samples.size == 0:
	        return audio

	    channels = audio.channels
	    original_dtype = samples.dtype

	    # Work in float so the result can be rounded/clipped here instead of
	    # being truncated by an implicit integer cast.
	    per_channel = samples.reshape(-1, channels).T.astype(np.float32)
	    noisy = per_channel[0] if channels == 1 else per_channel
	    reduced = np.asarray(nr.reduce_noise(y=noisy, sr=frame_rate))
	    reduced = reduced.reshape(channels, -1)

	    if np.issubdtype(original_dtype, np.integer):
	        limits = np.iinfo(original_dtype)
	        reduced = np.clip(np.rint(reduced), limits.min, limits.max)

	    # Back to interleaved frame order expected by AudioSegment.
	    interleaved = np.ascontiguousarray(reduced.T).astype(original_dtype).reshape(-1)
	    return array_to_audiosegment(interleaved, frame_rate, channels=channels)

	def apply_compression(audio):
	    """Return *audio* after ``compress_dynamic_range()`` with its defaults."""
	    compressed = audio.compress_dynamic_range()
	    return compressed

	def apply_reverb(audio):
	    """Overlay a quieter copy of *audio* onto itself.

	    The copy is ``audio - 10`` and is overlaid starting at position 1000.
	    """
	    return audio.overlay(audio - 10, position=1000)

	def apply_pitch_shift(audio, semitones=-2):
	    """Shift the pitch of *audio* by *semitones* via resampling.

	    The samples are re-read at a step of ``2 ** (semitones / 12)`` and played
	    back at the ORIGINAL frame rate, so the pitch moves by exactly
	    *semitones* (and the duration changes by the inverse ratio). The previous
	    code resampled *and* scaled the frame rate by the same ratio, which
	    applied the shift twice.

	    Each channel is interpolated separately (the sample array is interleaved),
	    and the result keeps the source sample dtype instead of assuming 16-bit.

	    Returns *audio* unchanged when it contains no samples.
	    """
	    ratio = 2.0 ** (semitones / 12)
	    samples = np.array(audio.get_array_of_samples())
	    if samples.size == 0:
	        return audio

	    channels = audio.channels
	    frames = samples.reshape(-1, channels)
	    frame_count = frames.shape[0]

	    read_positions = np.arange(0, frame_count, ratio)
	    frame_index = np.arange(frame_count)
	    shifted = np.stack(
	        [
	            np.interp(read_positions, frame_index, frames[:, channel])
	            for channel in range(channels)
	        ],
	        axis=1,
	    )

	    if np.issubdtype(samples.dtype, np.integer):
	        limits = np.iinfo(samples.dtype)
	        shifted = np.clip(np.rint(shifted), limits.min, limits.max)

	    # _spawn keeps frame rate, sample width and channel count of the source.
	    return audio._spawn(shifted.astype(samples.dtype).tobytes())

	def apply_echo(audio, delay_ms=500, decay=0.5):
	    """Overlay a delayed, attenuated copy of *audio* onto itself.

	    Args:
	        audio: Segment to process.
	        delay_ms: Offset in milliseconds at which the echo starts.
	        decay: Linear amplitude ratio of the echo relative to the source
	            (0.5 is roughly -6 dB). Previously this argument was ignored and
	            the echo was always 10 dB quieter.

	    Returns:
	        The mixed segment, or *audio* unchanged when ``decay <= 0`` (a silent
	        echo contributes nothing).
	    """
	    import math

	    if decay <= 0:
	        return audio

	    # Convert the linear amplitude ratio to a gain in decibels.
	    echo_gain_db = 20 * math.log10(decay)
	    echo = audio.apply_gain(echo_gain_db)
	    return audio.overlay(echo, position=delay_ms)

	def apply_stereo_widen(audio, pan_amount=0.3):
	    """Widen the stereo image of *audio* with mid/side processing.

	    The previous implementation fed the stereo output of ``pan()`` into
	    ``AudioSegment.from_mono_audiosegments``, which only accepts mono
	    segments, so the effect raised ``ValueError`` for every input.

	    The segment is converted to two channels, split into mid ``(L + R) / 2``
	    and side ``(L - R) / 2``, and the side signal is scaled by
	    ``1 + pan_amount`` before recombining.

	    Args:
	        audio: Segment to process (mono input is up-mixed to stereo; with no
	            side content it is returned as plain dual-mono stereo).
	        pan_amount: Extra side gain; 0 leaves the image unchanged, positive
	            values widen it.

	    Returns:
	        A stereo segment with the same frame rate and sample width.
	    """
	    stereo = audio.set_channels(2)
	    samples = np.array(stereo.get_array_of_samples())
	    if samples.size == 0:
	        return stereo

	    frames = samples.reshape(-1, 2).astype(np.float64)
	    mid = frames.mean(axis=1)
	    side = (frames[:, 0] - frames[:, 1]) / 2.0 * (1.0 + pan_amount)
	    widened = np.stack([mid + side, mid - side], axis=1)

	    if np.issubdtype(samples.dtype, np.integer):
	        # Boosting the side signal can exceed the sample range; clip rather
	        # than let the integer cast wrap around.
	        limits = np.iinfo(samples.dtype)
	        widened = np.clip(np.rint(widened), limits.min, limits.max)

	    return stereo._spawn(widened.astype(samples.dtype).tobytes())

	def apply_bass_boost(audio, gain=10):
	    """Boost the low end of *audio* by mixing an amplified low band back in.

	    The band below the 100 Hz low-pass cutoff is amplified by *gain* dB and
	    overlaid on the untouched source. The previous version returned only the
	    filtered band, which removed everything above the cutoff instead of
	    boosting the bass.
	    """
	    low_band = audio.low_pass_filter(100).apply_gain(gain)
	    return audio.overlay(low_band)

	def apply_treble_boost(audio, gain=10):
	    """Boost the high end of *audio* by mixing an amplified high band back in.

	    The band above the 4000 Hz high-pass cutoff is amplified by *gain* dB and
	    overlaid on the untouched source. The previous version returned only the
	    filtered band, which removed everything below the cutoff instead of
	    boosting the treble.
	    """
	    high_band = audio.high_pass_filter(4000).apply_gain(gain)
	    return audio.overlay(high_band)

	# === Vocal Isolation Helpers ===
	def load_track_local(path, sample_rate, channels=2):
	    """Load an audio file as a tensor at *sample_rate* with *channels* channels.

	    Args:
	        path: File to read with ``torchaudio.load``.
	        sample_rate: Target rate; the signal is resampled when the file's
	            rate differs.
	        channels: Requested channel count.

	    Returns:
	        For ``channels == 1`` the channel-averaged signal (``sig.mean(0)``, as
	        before). Otherwise a ``(channels, frames)`` tensor: a mono file is
	        duplicated across channels and surplus channels are dropped. Files
	        whose layout cannot be adapted that way are returned as loaded.
	    """
	    sig, rate = torchaudio.load(path)
	    if rate != sample_rate:
	        sig = torchaudio.functional.resample(sig, rate, sample_rate)

	    if channels == 1:
	        return sig.mean(0)

	    source_channels = sig.shape[0]
	    if source_channels == 1:
	        # Mono file but a multi-channel tensor was requested: duplicate the
	        # single channel so downstream models get the layout they asked for.
	        sig = sig.expand(channels, -1).clone()
	    elif source_channels > channels:
	        sig = sig[:channels]
	    return sig

	def save_track(path, wav, sample_rate):
	    """Write *wav* to *path* with ``torchaudio.save`` at *sample_rate*.

	    *path* is normalised through ``pathlib.Path`` and passed on as a string.
	    """
	    target = str(Path(path))
	    torchaudio.save(target, wav, sample_rate)

	def apply_vocal_isolation(audio_path):
	    """Separate the vocal stem of *audio_path* with Demucs and save it as WAV.

	    The input is normalised by the mean and standard deviation of its
	    channel-averaged signal before separation, and the separated sources are
	    de-normalised afterwards. The previous code subtracted ``ref[:, None]``
	    (shape ``(frames, 1)``) from a ``(channels, frames)`` tensor, which does
	    not broadcast.

	    Returns:
	        Path of a newly created temporary WAV file holding the vocals. A
	        unique file is used per call so concurrent requests cannot overwrite
	        each other's output.
	    """
	    model = pretrained.get_model(name='htdemucs')
	    wav = load_track_local(audio_path, model.samplerate, channels=2)

	    ref = wav.mean(0)
	    mean = ref.mean()
	    std = ref.std() + 1e-8  # guard against division by zero on silent input
	    normalized = (wav - mean) / std

	    sources = apply_model(model, normalized[None])[0]
	    sources = sources * std + mean

	    # Look the stem up by name rather than relying on a fixed position.
	    vocal_track = sources[model.sources.index("vocals")].cpu()

	    handle, out_path = tempfile.mkstemp(prefix="vocals_", suffix=".wav")
	    os.close(handle)
	    save_track(out_path, vocal_track, model.samplerate)
	    return out_path

	# === Stem Splitting (Drums, Bass, Other, Vocals) ===
	def stem_split(audio_path):
	    """Split *audio_path* into four stems and return them as file components.

	    Returns:
	        A list of ``gr.File`` values in the order vocals, drums, bass, other,
	        matching the labels of the Remix Mode outputs. The previous order
	        (drums, bass, other, vocals) put every stem under the wrong label.
	    """
	    model = pretrained.get_model(name='htdemucs')
	    wav = load_track_local(audio_path, model.samplerate, channels=2)
	    sources = apply_model(model, wav[None])[0]

	    # Map each separated tensor to the stem name the model reports for it.
	    stems_by_name = dict(zip(model.sources, sources))

	    output_dir = tempfile.mkdtemp()
	    stem_paths = []
	    for name in ("vocals", "drums", "bass", "other"):
	        path = os.path.join(output_dir, f"{name}.wav")
	        save_track(path, stems_by_name[name].cpu(), model.samplerate)
	        stem_paths.append(path)

	    return [gr.File(value=path) for path in stem_paths]

	# === Preset Loader with Fallback ===
	def load_presets():
	    """Read effect presets from the ``presets`` directory.

	    Every ``*.json`` file that parses and contains both a ``"name"`` and an
	    ``"effects"`` entry contributes ``name -> effects``. Files with invalid
	    JSON are reported and skipped. A missing directory is reported and yields
	    an empty dict.
	    """
	    try:
	        json_names = [name for name in os.listdir("presets") if name.endswith(".json")]
	    except FileNotFoundError:
	        print("Presets folder not found")
	        return {}

	    loaded = {}
	    for f in json_names:
	        preset_path = os.path.join("presets", f)
	        try:
	            with open(preset_path, "r") as handle:
	                data = json.load(handle)
	                has_required_keys = "name" in data and "effects" in data
	                if has_required_keys:
	                    loaded[data["name"]] = data["effects"]
	        except json.JSONDecodeError:
	            print(f"Invalid JSON: {f}")
	        except FileNotFoundError:
	            # A file that vanished between listing and opening is reported
	            # the same way the original outer handler did.
	            print("Presets folder not found")
	            return {}
	    return loaded

	# Presets loaded from disk; when none are found, fall back to the built-in set.
	preset_choices = load_presets() or {
	    "Default": [],
	    "Clean Podcast": ["Noise Reduction", "Normalize"],
	    "Music Remix": ["Bass Boost", "Stereo Widening"],
	}

	# Names offered in the preset dropdowns, in dictionary order.
	preset_names = list(preset_choices)

	# === Waveform + Spectrogram Generator ===
	def show_waveform(audio_file):
	    """Render the first 10000 samples of *audio_file* as a PNG image.

	    Returns:
	        A ``PIL.Image`` with the plot, or ``None`` when the file cannot be
	        read or plotted. The failure is printed rather than silently dropped.

	    The figure is created explicitly and closed in a ``finally`` block so a
	    failure while plotting or saving does not leave it open.
	    """
	    try:
	        audio = AudioSegment.from_file(audio_file)
	        samples = np.array(audio.get_array_of_samples())

	        fig, ax = plt.subplots(figsize=(10, 2))
	        try:
	            ax.plot(samples[:10000], color="blue")
	            ax.axis("off")
	            buf = BytesIO()
	            fig.savefig(buf, format="png", bbox_inches="tight", dpi=100)
	        finally:
	            plt.close(fig)

	        buf.seek(0)
	        return Image.open(buf)
	    except Exception as exc:
	        # Preview is best-effort: report and let the caller carry on.
	        print(f"Waveform preview failed: {exc}")
	        return None

	def detect_genre(audio_path):
	    """Return a genre label for *audio_path*.

	    The file is loaded and 13 MFCCs are computed and averaged, but the
	    features are not used for classification: the result is always
	    ``"Speech"`` when loading and feature extraction succeed and
	    ``"Unknown"`` when either raises.
	    """
	    try:
	        waveform, sr = torchaudio.load(audio_path)
	        flattened = waveform.numpy().flatten()
	        coefficients = librosa.feature.mfcc(y=flattened, sr=sr, n_mfcc=13)
	        _features = coefficients.mean(axis=1).reshape(1, -1)
	    except Exception:
	        return "Unknown"
	    return "Speech"

	# === Session Info Export ===
	def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
	    """Return a JSON string (2-space indent) describing one processing run.

	    The log records the current timestamp, the base name of *audio_path*, the
	    applied effects, the vocal-isolation flag, the export format and the
	    detected genre.
	    """
	    entry = {}
	    entry["timestamp"] = str(datetime.datetime.now())
	    entry["filename"] = os.path.basename(audio_path)
	    entry["effects_applied"] = effects
	    entry["isolate_vocals"] = isolate_vocals
	    entry["export_format"] = export_format
	    entry["detected_genre"] = genre
	    return json.dumps(entry, indent=2)

	# === Main Processing Function with Status Updates ===
	def _resolve_effects(preset_name, selected_effects):
	    """Return the preset's effects followed by any extra user-selected ones."""
	    preset_effects = list(preset_choices.get(preset_name) or [])
	    extras = [name for name in (selected_effects or []) if name not in preset_effects]
	    return preset_effects + extras


	def _make_temp_path(suffix):
	    """Create an empty, closed temporary file and return its path."""
	    handle, path = tempfile.mkstemp(suffix=suffix)
	    os.close(handle)
	    return path


	def _remove_quietly(path):
	    """Delete *path* if it is set and exists; ignore filesystem errors."""
	    if not path:
	        return
	    try:
	        os.remove(path)
	    except OSError:
	        pass


	def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
	    """Apply effects to one file, optionally isolate vocals, and export it.

	    Args:
	        audio_file: Path of the uploaded audio.
	        selected_effects: Effect names ticked by the user (may be ``None``).
	        isolate_vocals: When true, run vocal isolation after the effects.
	        preset_name: Name of the preset whose effects are applied first.
	        export_format: Target format name such as ``"MP3"`` or ``"WAV"``.

	    Returns:
	        ``(output_path, waveform_image, session_log, genre, status)`` on
	        success, or ``(None, None, status, "", status)`` when anything fails.

	    Changes from the previous version:
	      * user-selected effects are no longer discarded whenever the preset
	        name exists; they are applied after the preset's own effects;
	      * the output file's suffix matches the export format instead of always
	        being ``.wav``;
	      * the export no longer writes to a path whose temp-file handle is still
	        open, and intermediate files use unique names and are removed.
	    """
	    temp_input = None
	    vocal_path = None
	    try:
	        audio = AudioSegment.from_file(audio_file)

	        effect_map = {
	            "Noise Reduction": apply_noise_reduction,
	            "Compress Dynamic Range": apply_compression,
	            "Add Reverb": apply_reverb,
	            "Pitch Shift": apply_pitch_shift,
	            "Echo": apply_echo,
	            "Stereo Widening": apply_stereo_widen,
	            "Bass Boost": apply_bass_boost,
	            "Treble Boost": apply_treble_boost,
	            "Normalize": apply_normalize,
	        }

	        effects_to_apply = _resolve_effects(preset_name, selected_effects)
	        for effect_name in effects_to_apply:
	            effect = effect_map.get(effect_name)
	            if effect is not None:
	                audio = effect(audio)

	        if isolate_vocals:
	            temp_input = _make_temp_path(".wav")
	            audio.export(temp_input, format="wav")
	            vocal_path = apply_vocal_isolation(temp_input)
	            final_audio = AudioSegment.from_wav(vocal_path)
	        else:
	            final_audio = audio

	        export_ext = export_format.lower()
	        output_path = _make_temp_path(f".{export_ext}")
	        final_audio.export(output_path, format=export_ext)

	        waveform_image = show_waveform(output_path)
	        genre = detect_genre(output_path)
	        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)

	        status = "🎉 Done!"
	        return output_path, waveform_image, session_log, genre, status

	    except Exception as e:
	        status = f"❌ Error: {str(e)}"
	        return None, None, status, "", status

	    finally:
	        # Intermediate files are already loaded into memory at this point.
	        _remove_quietly(temp_input)
	        _remove_quietly(vocal_path)

	# === Batch Processing Function ===
	def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
	    """Process several uploads with ``process_audio`` and bundle them in a ZIP.

	    Args:
	        files: Uploaded files, either path strings or objects exposing the
	            path as ``.name``.
	        selected_effects, isolate_vocals, preset_name, export_format: Passed
	            through to ``process_audio`` for every file.

	    Returns:
	        ``(zip_path, status_message)``; ``zip_path`` is ``None`` when nothing
	        was uploaded, no file could be processed, or an unexpected error
	        occurred.

	    A file that fails no longer aborts the whole batch: it is skipped, its
	    error text is stored in the archive as ``error_<i>.txt``, and the status
	    message says how many files were skipped.
	    """
	    if not files:
	        return None, "❌ No files were uploaded."

	    try:
	        output_dir = tempfile.mkdtemp()
	        extension = export_format.lower()
	        zip_path = os.path.join(output_dir, "batch_output.zip")

	        succeeded = 0
	        skipped = 0
	        with zipfile.ZipFile(zip_path, 'w') as zipf:
	            for i, file in enumerate(files):
	                # Uploads may arrive as plain paths or as file-like objects.
	                source_path = getattr(file, "name", file)
	                processed_path, _, log, _, _ = process_audio(
	                    source_path, selected_effects, isolate_vocals, preset_name, export_format
	                )
	                if processed_path is None:
	                    # On failure the log slot carries the error message.
	                    skipped += 1
	                    zipf.writestr(f"error_{i}.txt", str(log))
	                    continue
	                zipf.write(processed_path, f"processed_{i}.{extension}")
	                zipf.writestr(f"session_info_{i}.json", log)
	                succeeded += 1

	        if succeeded == 0:
	            return None, "❌ Batch processing failed: no file could be processed."
	        if skipped:
	            return zip_path, f"📦 ZIP created successfully! ({skipped} file(s) skipped due to errors)"
	        return zip_path, "📦 ZIP created successfully!"

	    except Exception as e:
	        return None, f"❌ Batch processing failed: {str(e)}"

	# === Load Models Once at Start ===

	# 🧠 Speaker Diarization Model
	# The access token is read from the HF_TOKEN environment variable instead of
	# being hard-coded; the previous literal was a non-functional placeholder.
	diarize_model = DiarizationPipeline.from_pretrained(
	    "pyannote/speaker-diarization",
	    use_auth_token=os.environ.get("HF_TOKEN"),
	)

	# 🎤 OpenVoice TTS + Converter
	# NOTE(review): the constructor arguments below are unchanged; confirm they
	# match the OpenVoice version that is installed.
	tts_model = TTS(lang='en')
	tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")

	# === Transcribe & Diarize Tab ===
	whisper_model = WhisperModel("base")

	def diarize_and_transcribe(audio_path):
	    """Return a per-speaker transcript for *audio_path*.

	    The audio is converted to a temporary WAV file, diarized with
	    ``diarize_model`` and transcribed with the already-loaded faster-whisper
	    ``whisper_model``. The previous code called ``whisper.transcribe`` without
	    a model and read segments both as attributes and as dict items.

	    Returns:
	        A list of dicts with ``speaker``, ``start``, ``end`` and ``text``;
	        ``text`` joins the transcript segments that lie entirely inside the
	        speaker turn.
	    """
	    audio = AudioSegment.from_file(audio_path)

	    # Unique temp file so concurrent requests cannot overwrite each other.
	    handle, temp_wav = tempfile.mkstemp(prefix="diarize_", suffix=".wav")
	    os.close(handle)
	    try:
	        audio.export(temp_wav, format="wav")

	        # Run diarization
	        diarization = diarize_model(temp_wav)

	        # Run transcription; the segments come back as a one-shot iterator,
	        # so materialise them before matching against several turns.
	        segment_iter, _info = whisper_model.transcribe(temp_wav)
	        transcript = list(segment_iter)
	    finally:
	        try:
	            os.remove(temp_wav)
	        except OSError:
	            pass

	    segments = []
	    for turn, _, speaker in diarization.itertracks(yield_label=True):
	        text = " ".join(
	            seg.text.strip()
	            for seg in transcript
	            if seg.start >= turn.start and seg.end <= turn.end
	        )
	        segments.append({
	            "speaker": speaker,
	            "start": turn.start,
	            "end": turn.end,
	            "text": text
	        })

	    return segments

	# === Voice Cloning (Dubbing) ===
	def clone_voice(source_audio, target_audio, text):
	    """Synthesize *text* and convert its tone colour between two voices.

	    Args:
	        source_audio: Source voice clip (path, or an upload object exposing
	            the path as ``.name``).
	        target_audio: Target voice clip, same accepted forms.
	        text: Text to synthesize.

	    Returns:
	        Path of a newly created WAV file with the converted speech.

	    The intermediate TTS file and the final output now use separate, unique
	    temporary files; previously both shared one fixed path in the temp
	    directory, so concurrent requests overwrote each other.

	    NOTE(review): the ``get_se`` / ``tts_to_file`` / ``convert`` call
	    signatures are kept exactly as they were; confirm them against the
	    installed OpenVoice version.
	    """
	    source_path = getattr(source_audio, "name", source_audio)
	    target_path = getattr(target_audio, "name", target_audio)

	    source_se, _ = get_se(source_path)
	    target_se, _ = get_se(target_path)

	    tts_handle, tts_path = tempfile.mkstemp(prefix="tts_", suffix=".wav")
	    os.close(tts_handle)
	    out_handle, out_path = tempfile.mkstemp(prefix="cloned_output_", suffix=".wav")
	    os.close(out_handle)

	    try:
	        tts_model.tts_to_file(text=text, file_path=tts_path)
	        tone_converter.convert(
	            audio_src_path=tts_path,
	            src_se=source_se,
	            tgt_se=target_se,
	            output_path=out_path
	        )
	    finally:
	        try:
	            os.remove(tts_path)
	        except OSError:
	            pass
	    return out_path

	# === UI ===
	# Effect names offered in the "Apply Effects in Order" checkbox groups.
	# Each entry is spelled exactly like a key of the effect_map dictionary
	# built inside process_audio.
	effect_options = [
	"Noise Reduction",
	"Compress Dynamic Range",
	"Add Reverb",
	"Pitch Shift",
	"Echo",
	"Stereo Widening",
	"Bass Boost",
	"Treble Boost",
	"Normalize"
	]

	# The three tab callbacks below were referenced by the UI but never defined,
	# which made the app fail with NameError while the interface was being built.
	# They are implemented with objects this module already creates.

	def transcribe_audio(audio_path):
	    """Return the transcript of *audio_path* as a single string.

	    Uses the module-level faster-whisper ``whisper_model``. Returns an empty
	    string when no file was provided.
	    """
	    if not audio_path:
	        return ""
	    segment_iter, _info = whisper_model.transcribe(audio_path)
	    return " ".join(segment.text.strip() for segment in segment_iter)


	def generate_tts(text):
	    """Synthesize *text* with ``tts_model`` and return the WAV file path.

	    Raises:
	        gr.Error: when *text* is empty or only whitespace.
	    """
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to synthesize.")
	    handle, out_path = tempfile.mkstemp(prefix="tts_", suffix=".wav")
	    os.close(handle)
	    tts_model.tts_to_file(text=text, file_path=out_path)
	    return out_path


	def analyze_audio(audio_path):
	    """Return ``(stats, waveform_image)`` for *audio_path*.

	    ``stats`` holds duration, channel count, sample rate, average and peak
	    level in dBFS (``None`` for digital silence), an estimated tempo and the
	    fraction of the track that is more than 40 dB below its peak.
	    """
	    audio = AudioSegment.from_file(audio_path)
	    signal, sr = librosa.load(audio_path, sr=None, mono=True)

	    tempo, _beats = librosa.beat.beat_track(y=signal, sr=sr)
	    # Depending on the librosa version the tempo is a scalar or a 1-element array.
	    bpm = float(np.atleast_1d(tempo)[0])

	    if len(signal):
	        audible = librosa.effects.split(signal, top_db=40)
	        audible_samples = int(sum(end - start for start, end in audible))
	        silence_ratio = 1.0 - audible_samples / len(signal)
	    else:
	        silence_ratio = 0.0

	    def _finite_db(value):
	        # dBFS is -inf for silence, which JSON cannot represent.
	        return None if value == float("-inf") else round(value, 2)

	    stats = {
	        "duration_seconds": round(len(audio) / 1000.0, 2),
	        "channels": audio.channels,
	        "sample_rate_hz": audio.frame_rate,
	        "loudness_dbfs": _finite_db(audio.dBFS),
	        "peak_dbfs": _finite_db(audio.max_dBFS),
	        "bpm": round(bpm, 1),
	        "silence_ratio": round(silence_ratio, 3),
	    }
	    return stats, show_waveform(audio_path)


	with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
	    gr.Markdown("## 🎧 Ultimate AI Audio Studio\nUpload, edit, export — powered by AI!")

	    # --- Single File Studio ---
	    with gr.Tab("🎵 Single File Studio"):
	        gr.Interface(
	            fn=process_audio,
	            inputs=[
	                gr.Audio(label="Upload Audio", type="filepath"),
	                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
	                gr.Checkbox(label="Isolate Vocals After Effects"),
	                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
	                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
	            ],
	            outputs=[
	                gr.Audio(label="Processed Audio", type="filepath"),
	                gr.Image(label="Waveform Preview"),
	                gr.Textbox(label="Session Log (JSON)", lines=5),
	                gr.Textbox(label="Detected Genre", lines=1),
	                gr.Textbox(label="Status", value="✅ Ready", lines=1)
	            ],
	            title="Edit One File at a Time",
	            description="Apply effects, preview waveform, and get full session log.",
	            flagging_mode="never",
	            submit_btn="Process Audio",
	            clear_btn=None
	        )

	    # --- Batch Processing ---
	    with gr.Tab("🔊 Batch Processing"):
	        gr.Interface(
	            fn=batch_process_audio,
	            inputs=[
	                gr.File(label="Upload Multiple Files", file_count="multiple"),
	                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
	                gr.Checkbox(label="Isolate Vocals After Effects"),
	                gr.Dropdown(choices=preset_names, label="Select Preset", value=preset_names[0] if preset_names else None),
	                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
	            ],
	            outputs=[
	                gr.File(label="Download ZIP of All Processed Files"),
	                gr.Textbox(label="Status", value="✅ Ready", lines=1)
	            ],
	            title="Batch Audio Processor",
	            description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
	            flagging_mode="never",
	            submit_btn="Process All Files",
	            clear_btn=None
	        )

	    # --- Remix Mode ---
	    with gr.Tab("🎛 Remix Mode"):
	        gr.Interface(
	            fn=stem_split,
	            inputs=gr.Audio(label="Upload Music Track", type="filepath"),
	            outputs=[
	                gr.File(label="Vocals"),
	                gr.File(label="Drums"),
	                gr.File(label="Bass"),
	                gr.File(label="Other")
	            ],
	            title="Split Into Drums, Bass, Vocals, and More",
	            description="Use AI to separate musical elements like vocals, drums, and bass.",
	            flagging_mode="never",
	            clear_btn=None
	        )

	    # --- Transcribe & Edit ===
	    with gr.Tab("📝 Transcribe & Edit"):
	        gr.Interface(
	            fn=transcribe_audio,
	            inputs=gr.Audio(label="Upload Audio", type="filepath"),
	            outputs=gr.Textbox(label="Transcribed Text", lines=10),
	            title="Transcribe & Edit Spoken Content",
	            description="Convert voice to text and edit it before exporting again."
	        )

	    # --- Speaker Diarization ===
	    with gr.Tab("🧏‍♂️ Who Spoke When?"):
	        gr.Interface(
	            fn=diarize_and_transcribe,
	            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
	            outputs=gr.JSON(label="Diarized Transcript"),
	            title="Split By Speaker + Transcribe",
	            description="Detect speakers and transcribe their speech automatically."
	        )

	    # --- Voice Cloning (Dubbing) ===
	    with gr.Tab("🎭 Voice Cloning (Dubbing)"):
	        gr.Interface(
	            fn=clone_voice,
	            inputs=[
	                gr.File(label="Source Voice Clip"),
	                gr.File(label="Target Voice Clip"),
	                gr.Textbox(label="Text to Clone", lines=5)
	            ],
	            outputs=gr.Audio(label="Cloned Output", type="filepath"),
	            title="Replace One Voice With Another",
	            description="Clone voice from source to target speaker using AI"
	        )

	    # --- TTS Voice Generator ===
	    with gr.Tab("💬 TTS Voice Generator"):
	        gr.Interface(
	            fn=generate_tts,
	            inputs=gr.Textbox(label="Enter Text", lines=5),
	            outputs=gr.Audio(label="Generated Speech", type="filepath"),
	            title="Text-to-Speech Generator",
	            description="Type anything and turn it into natural-sounding speech."
	        )

	    # --- Audio Analysis Dashboard ===
	    with gr.Tab("📊 Audio Analysis"):
	        gr.Interface(
	            fn=analyze_audio,
	            inputs=gr.Audio(label="Upload Track", type="filepath"),
	            outputs=[
	                gr.JSON(label="Audio Stats"),
	                gr.Image(label="Waveform Graph")
	            ],
	            title="View Loudness, BPM, Silence, and More",
	            description="Analyze audio loudness, tempo, and frequency content."
	        )

	demo.launch()