import gradio as gr
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import numpy as np
import tempfile
import os
import noisereduce as nr
import json
import torch
from demucs import pretrained
from demucs.apply import apply_model
import torchaudio
from pathlib import Path
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import zipfile
import datetime
import librosa
import warnings
from faster_whisper import WhisperModel
from TTS.api import TTS
import pickle

# Suppress warnings
warnings.filterwarnings("ignore")


# === Helper Functions ===
def audiosegment_to_array(audio):
    return np.array(audio.get_array_of_samples()), audio.frame_rate


def array_to_audiosegment(samples, frame_rate, channels=1):
    return AudioSegment(
        samples.tobytes(),
        frame_rate=frame_rate,
        sample_width=samples.dtype.itemsize,
        channels=channels
    )


# === Effect Functions ===
def apply_normalize(audio):
    return audio.normalize()


def apply_noise_reduction(audio):
    samples, frame_rate = audiosegment_to_array(audio)
    # noisereduce returns floats; cast back to the original integer dtype so
    # array_to_audiosegment derives the correct sample width.
    reduced = nr.reduce_noise(y=samples.astype(np.float32), sr=frame_rate)
    return array_to_audiosegment(reduced.astype(samples.dtype), frame_rate, channels=audio.channels)


def apply_compression(audio):
    return audio.compress_dynamic_range()


def apply_reverb(audio):
    # Cheap pseudo-reverb: overlay a quieter copy of the signal 1 s later.
    reverb = audio - 10
    return audio.overlay(reverb, position=1000)


def apply_pitch_shift(audio, semitones=-2):
    # Naive pitch shift via resampling; duration changes slightly as a side effect.
    factor = 2 ** (semitones / 12)
    new_frame_rate = int(audio.frame_rate * factor)
    samples = np.array(audio.get_array_of_samples())
    resampled = np.interp(
        np.arange(0, len(samples), factor),
        np.arange(len(samples)),
        samples
    ).astype(samples.dtype)
    return AudioSegment(
        resampled.tobytes(),
        frame_rate=new_frame_rate,
        sample_width=audio.sample_width,
        channels=audio.channels
    )


def apply_echo(audio, delay_ms=500, decay=0.5):
    # Interpret `decay` as a linear gain on the echo tail (converted to dB).
    echo = audio + (20 * np.log10(max(decay, 1e-3)))
    return audio.overlay(echo, position=delay_ms)


def apply_stereo_widen(audio, pan_amount=0.3):
    # pydub's pan() always returns a stereo segment, so collapse each panned
    # copy back to mono before recombining into a widened stereo track.
    left = audio.pan(-pan_amount).set_channels(1)
    right = audio.pan(pan_amount).set_channels(1)
    return AudioSegment.from_mono_audiosegments(left, right)


def apply_bass_boost(audio, gain=10):
    # Boost the low band and mix it back in, rather than discarding the highs.
    return audio.overlay(audio.low_pass_filter(100) + gain)


def apply_treble_boost(audio, gain=10):
    return audio.overlay(audio.high_pass_filter(4000) + gain)


# === Vocal Isolation Helpers ===
def load_track_local(path, sample_rate, channels=2):
    sig, rate = torchaudio.load(path)
    if rate != sample_rate:
        sig = torchaudio.functional.resample(sig, rate, sample_rate)
    if channels == 1:
        sig = sig.mean(0, keepdim=True)
    elif channels == 2 and sig.shape[0] == 1:
        # Upmix mono input so Demucs always sees a stereo tensor.
        sig = sig.repeat(2, 1)
    return sig


def save_track(path, wav, sample_rate):
    path = Path(path)
    torchaudio.save(str(path), wav, sample_rate)


def apply_vocal_isolation(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    # Standard Demucs normalization: zero-mean / unit-std of the mono reference.
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(model, wav[None])[0]
    sources = sources * ref.std() + ref.mean()
    # htdemucs source order: drums, bass, other, vocals.
    vocal_track = sources[3].cpu()
    out_path = os.path.join(tempfile.gettempdir(), "vocals.wav")
    save_track(out_path, vocal_track, model.samplerate)
    return out_path


# === Stem Splitting (Drums, Bass, Other, Vocals) ===
def stem_split(audio_path):
    model = pretrained.get_model(name='htdemucs')
    wav = load_track_local(audio_path, model.samplerate, channels=2)
    sources = apply_model(model, wav[None])[0]
    output_dir = tempfile.mkdtemp()
    stems = {}
    for i, name in enumerate(['drums', 'bass', 'other', 'vocals']):
        path = os.path.join(output_dir, f"{name}.wav")
        save_track(path, sources[i].cpu(), model.samplerate)
        stems[name] = path
    # Return plain paths in the order the Remix tab's outputs expect:
    # Vocals, Drums, Bass, Other.
    return stems['vocals'], stems['drums'], stems['bass'], stems['other']
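
# A preset is a JSON file in ./presets/. A minimal example of the schema that
# load_presets() below accepts (filename and values are illustrative, not
# shipped with the app):
#
#   {"name": "Clean Podcast", "effects": ["Noise Reduction", "Normalize"]}
#
# Any file that parses and contains both "name" and "effects" keys is kept.
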
# === Preset Loader with Fallback ===
def load_presets():
    try:
        preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
        presets = {}
        for f in preset_files:
            path = os.path.join("presets", f)
            try:
                with open(path, "r") as infile:
                    data = json.load(infile)
                    if "name" in data and "effects" in data:
                        presets[data["name"]] = data["effects"]
            except json.JSONDecodeError:
                print(f"Invalid JSON: {f}")
        return presets
    except FileNotFoundError:
        print("Presets folder not found")
        return {}


preset_choices = load_presets()
if not preset_choices:
    preset_choices = {
        "Default": [],
        "Clean Podcast": ["Noise Reduction", "Normalize"],
        "Music Remix": ["Bass Boost", "Stereo Widening"]
    }
preset_names = list(preset_choices.keys())


# === Waveform Preview Generator ===
def show_waveform(audio_file):
    try:
        audio = AudioSegment.from_file(audio_file)
        samples = np.array(audio.get_array_of_samples())
        plt.figure(figsize=(10, 2))
        plt.plot(samples[:10000], color="blue")
        plt.axis("off")
        buf = BytesIO()
        plt.savefig(buf, format="png", bbox_inches="tight", dpi=100)
        plt.close()
        buf.seek(0)
        return Image.open(buf)
    except Exception:
        return None


def detect_genre(audio_path):
    # Placeholder classifier: extracts MFCC features but always answers
    # "Speech" until a real genre model is plugged in.
    try:
        y, sr = torchaudio.load(audio_path)
        librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13)
        return "Speech"
    except Exception:
        return "Unknown"


# === Session Info Export ===
def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
    log = {
        "timestamp": str(datetime.datetime.now()),
        "filename": os.path.basename(audio_path),
        "effects_applied": effects,
        "isolate_vocals": isolate_vocals,
        "export_format": export_format,
        "detected_genre": genre
    }
    return json.dumps(log, indent=2)


# === Main Processing Function with Status Updates ===
def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
    status = "🔊 Loading audio..."
    try:
        audio = AudioSegment.from_file(audio_file)
        status = "🛠 Applying effects..."
        effect_map = {
            "Noise Reduction": apply_noise_reduction,
            "Compress Dynamic Range": apply_compression,
            "Add Reverb": apply_reverb,
            "Pitch Shift": apply_pitch_shift,
            "Echo": apply_echo,
            "Stereo Widening": apply_stereo_widen,
            "Bass Boost": apply_bass_boost,
            "Treble Boost": apply_treble_boost,
            "Normalize": apply_normalize,
        }
        # A known preset overrides the manual checkbox selection.
        effects_to_apply = preset_choices.get(preset_name, selected_effects)
        for effect_name in effects_to_apply:
            if effect_name in effect_map:
                audio = effect_map[effect_name](audio)

        status = "💾 Saving final audio..."
        if isolate_vocals:
            temp_input = os.path.join(tempfile.gettempdir(), "input.wav")
            audio.export(temp_input, format="wav")
            vocal_path = apply_vocal_isolation(temp_input)
            final_audio = AudioSegment.from_wav(vocal_path)
        else:
            final_audio = audio

        # Match the temp file's extension to the chosen export format.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{export_format.lower()}") as f:
            output_path = f.name
        final_audio.export(output_path, format=export_format.lower())

        waveform_image = show_waveform(output_path)
        genre = detect_genre(output_path)
        session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
        status = "🎉 Done!"
        return output_path, waveform_image, session_log, genre, status
    except Exception as e:
        status = f"❌ Error: {str(e)}"
        return None, None, status, "", status
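
# Driving the pipeline without the UI (a sketch; the paths are illustrative):
#
#   out_path, waveform, log, genre, status = process_audio(
#       "input.wav", ["Normalize", "Echo"], isolate_vocals=False,
#       preset_name="Custom", export_format="WAV")
#
# Because effects come from preset_choices.get(preset_name, selected_effects),
# an unrecognized preset_name like "Custom" falls through to the manually
# selected effect list, while a known preset replaces it entirely.
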
# === Batch Processing Function ===
def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
    try:
        output_dir = tempfile.mkdtemp()
        results = []
        session_logs = []
        for file in files:
            # gr.File hands back plain path strings in recent Gradio; fall back
            # to .name for older versions that pass tempfile wrappers.
            file_path = getattr(file, "name", file)
            processed_path, _, log, _, _ = process_audio(
                file_path, selected_effects, isolate_vocals, preset_name, export_format
            )
            results.append(processed_path)
            session_logs.append(log)

        zip_path = os.path.join(output_dir, "batch_output.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for i, res in enumerate(results):
                if res is None:
                    continue  # Skip files that failed to process.
                filename = f"processed_{i}.{export_format.lower()}"
                zipf.write(res, filename)
                zipf.writestr(f"session_info_{i}.json", session_logs[i])
        return zip_path, "📦 ZIP created successfully!"
    except Exception as e:
        return None, f"❌ Batch processing failed: {str(e)}"


# === Transcribe & Edit Tab ===
whisper_model = WhisperModel("base")


def transcribe_audio(audio_path):
    segments, _ = whisper_model.transcribe(audio_path, beam_size=5)
    return " ".join(seg.text for seg in segments)


# === TTS Tab ===
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)


def generate_tts(text):
    out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
    tts.tts_to_file(text=text, file_path=out_path)
    return out_path


# === Trim Silence Automatically (VAD) ===
def detect_silence(audio_file, silence_threshold=-50.0, min_silence_len=1000):
    audio = AudioSegment.from_file(getattr(audio_file, "name", audio_file))
    nonsilent_ranges = detect_nonsilent(
        audio,
        min_silence_len=int(min_silence_len),
        silence_thresh=silence_threshold
    )
    out_path = os.path.join(tempfile.gettempdir(), "trimmed.wav")
    if not nonsilent_ranges:
        # Nothing but silence detected; return the file unchanged.
        audio.export(out_path, format="wav")
        return out_path
    trimmed = audio[nonsilent_ranges[0][0]:nonsilent_ranges[-1][1]]
    trimmed.export(out_path, format="wav")
    return out_path


# === Mix Two Tracks ===
def mix_tracks(track1, track2, volume_offset=0):
    a1 = AudioSegment.from_file(getattr(track1, "name", track1))
    a2 = AudioSegment.from_file(getattr(track2, "name", track2))
    mixed = a1.overlay(a2 - volume_offset)
    out_path = os.path.join(tempfile.gettempdir(), "mixed.wav")
    mixed.export(out_path, format="wav")
    return out_path


# === Save/Load Project File (.aiproj) ===
def save_project(audio_path, preset_name, effects):
    # An .aiproj file is just a pickled dict holding the raw audio bytes,
    # the preset name, and the effect list.
    project_data = {
        "audio": AudioSegment.from_file(getattr(audio_path, "name", audio_path)).raw_data,
        "preset": preset_name,
        "effects": effects
    }
    out_path = os.path.join(tempfile.gettempdir(), "project.aiproj")
    with open(out_path, "wb") as f:
        pickle.dump(project_data, f)
    return out_path


def load_project(project_file):
    # Caution: pickle can execute arbitrary code on load; only open .aiproj
    # files you created yourself.
    with open(getattr(project_file, "name", project_file), "rb") as f:
        data = pickle.load(f)
    return data["preset"], data["effects"]


# === Auto-Save / Resume Sessions ===
def save_or_resume_session(audio, preset, effects, action="save", state=None):
    # "save" stashes the current UI values into gr.State; "load" restores them
    # from the state that the click handler passes back in.
    if action == "save":
        return {"audio": audio, "preset": preset, "effects": effects}, None, None, None
    if action == "load" and isinstance(state, dict):
        return state, state.get("audio"), state.get("preset"), state.get("effects")
    return state, None, None, None


# === Voice Cloning – Fallback Version for Hugging Face ===
def clone_voice(source_audio, target_audio, text):
    print("⚠️ Voice cloning not available in browser version — use local install for full support")
    return generate_tts(text)


# === UI Setup ===
effect_options = [
    "Noise Reduction",
    "Compress Dynamic Range",
    "Add Reverb",
    "Pitch Shift",
    "Echo",
    "Stereo Widening",
    "Bass Boost",
    "Treble Boost",
    "Normalize"
]
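
# Note: WhisperModel("base") and the Coqui TTS model above are fetched from
# their model hubs the first time the script runs, so the first launch needs
# network access and can take a while before the UI appears.
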
with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
    gr.Markdown("## 🎧 Ultimate AI Audio Studio\nUpload, edit, export — powered by AI!")

    # --- Single File Studio ---
    with gr.Tab("🎵 Single File Studio"):
        gr.Interface(
            fn=process_audio,
            inputs=[
                gr.Audio(label="Upload Audio", type="filepath"),
                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
                gr.Checkbox(label="Isolate Vocals After Effects"),
                gr.Dropdown(choices=preset_names, label="Select Preset",
                            value=preset_names[0] if preset_names else None),
                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
            ],
            outputs=[
                gr.Audio(label="Processed Audio", type="filepath"),
                gr.Image(label="Waveform Preview"),
                gr.Textbox(label="Session Log (JSON)", lines=5),
                gr.Textbox(label="Detected Genre", lines=1),
                gr.Textbox(label="Status", value="✅ Ready", lines=1)
            ],
            title="Edit One File at a Time",
            description="Apply effects, preview the waveform, and get a full session log.",
            flagging_mode="never",
            submit_btn="Process Audio",
            clear_btn=None
        )

    # --- Batch Processing ---
    with gr.Tab("🔊 Batch Processing"):
        gr.Interface(
            fn=batch_process_audio,
            inputs=[
                gr.File(label="Upload Multiple Files", file_count="multiple"),
                gr.CheckboxGroup(choices=effect_options, label="Apply Effects in Order"),
                gr.Checkbox(label="Isolate Vocals After Effects"),
                gr.Dropdown(choices=preset_names, label="Select Preset",
                            value=preset_names[0] if preset_names else None),
                gr.Dropdown(choices=["MP3", "WAV"], label="Export Format", value="MP3")
            ],
            outputs=[
                gr.File(label="Download ZIP of All Processed Files"),
                gr.Textbox(label="Status", value="✅ Ready", lines=1)
            ],
            title="Batch Audio Processor",
            description="Upload multiple files, apply effects in bulk, and download all results in a single ZIP.",
            flagging_mode="never",
            submit_btn="Process All Files",
            clear_btn=None
        )

    # --- Remix Mode ---
    with gr.Tab("🎛 Remix Mode"):
        gr.Interface(
            fn=stem_split,
            inputs=gr.Audio(label="Upload Music Track", type="filepath"),
            outputs=[
                gr.File(label="Vocals"),
                gr.File(label="Drums"),
                gr.File(label="Bass"),
                gr.File(label="Other")
            ],
            title="Split Into Drums, Bass, Vocals, and More",
            description="Use AI to separate musical elements like vocals, drums, and bass.",
            flagging_mode="never",
            clear_btn=None
        )

    # --- Transcribe & Edit ---
    with gr.Tab("📝 Transcribe & Edit"):
        gr.Interface(
            fn=transcribe_audio,
            inputs=gr.Audio(label="Upload Audio", type="filepath"),
            outputs=gr.Textbox(label="Transcribed Text", lines=10),
            title="Transcribe Spoken Content",
            description="Convert voice to text and edit it before exporting again.",
            flagging_mode="never"
        )

    # --- TTS Voice Generator ---
    with gr.Tab("💬 TTS Voice Generator"):
        gr.Interface(
            fn=generate_tts,
            inputs=gr.Textbox(label="Enter Text", lines=5),
            outputs=gr.Audio(label="Generated Speech", type="filepath"),
            title="Text-to-Speech Generator",
            description="Type anything and turn it into natural-sounding speech.",
            flagging_mode="never"
        )
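
    # Note on the tab below: pydub's detect_nonsilent() returns a list of
    # [start_ms, end_ms] pairs for non-silent regions; detect_silence() keeps
    # everything between the first and last pair, so pauses in the middle of
    # the recording are preserved rather than cut out.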
    # --- VAD – Detect & Remove Silence ---
    with gr.Tab("✂️ Trim Silence Automatically"):
        gr.Interface(
            fn=detect_silence,
            inputs=[
                gr.File(label="Upload Track"),
                gr.Slider(minimum=-100, maximum=-10, value=-50, label="Silence Threshold (dB)"),
                gr.Number(label="Min Silence Length (ms)", value=1000)
            ],
            outputs=gr.File(label="Trimmed Output"),
            title="Auto-Detect & Remove Silence",
            description="Detect and trim silence at the start/end or between words.",
            flagging_mode="never"
        )

    # --- Save/Load Project File (.aiproj) ---
    with gr.Tab("📁 Save/Load Project"):
        gr.Interface(
            fn=save_project,
            inputs=[
                gr.File(label="Original Audio"),
                gr.Dropdown(choices=preset_names, label="Used Preset",
                            value=preset_names[0] if preset_names else None),
                gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
            ],
            outputs=gr.File(label="Project File (.aiproj)"),
            title="Save Everything Together",
            description="Save your session, effects, and settings in one file to reuse later.",
            flagging_mode="never"
        )
        gr.Interface(
            fn=load_project,
            inputs=gr.File(label="Upload .aiproj File"),
            outputs=[
                gr.Dropdown(choices=preset_names, label="Loaded Preset"),
                gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")
            ],
            title="Resume Last Project",
            description="Load your saved session.",
            flagging_mode="never"
        )

    # --- Auto-Save / Resume Sessions ---
    session_state = gr.State()
    with gr.Tab("🧾 Auto-Save & Resume"):
        gr.Markdown("Save your current state and resume editing later.")
        action_radio = gr.Radio(["save", "load"], label="Action", value="save")
        audio_input = gr.Audio(label="Upload or Load Audio", type="filepath")
        preset_dropdown = gr.Dropdown(choices=preset_names, label="Used Preset",
                                      value=preset_names[0] if preset_names else None)
        effect_checkbox = gr.CheckboxGroup(choices=effect_options, label="Applied Effects")
        save_btn = gr.Button("Save or Load Session")
        loaded_audio = gr.Audio(label="Loaded Audio", type="filepath")
        loaded_preset = gr.Dropdown(choices=preset_names, label="Loaded Preset")
        loaded_effects = gr.CheckboxGroup(choices=effect_options, label="Loaded Effects")

        # Pass the session state in as an input so the "load" action can
        # actually read back what "save" stored.
        save_btn.click(
            fn=save_or_resume_session,
            inputs=[audio_input, preset_dropdown, effect_checkbox, action_radio, session_state],
            outputs=[session_state, loaded_audio, loaded_preset, loaded_effects]
        )

    # --- Mix Two Tracks ---
    with gr.Tab("🔀 Mix Two Tracks"):
        gr.Interface(
            fn=mix_tracks,
            inputs=[
                gr.File(label="Main Track"),
                gr.File(label="Background Track"),
                gr.Slider(minimum=-10, maximum=10, value=0, label="Volume Offset (dB)")
            ],
            outputs=gr.File(label="Mixed Output"),
            title="Overlay Two Tracks",
            description="Mix, blend, or subtract two audio files.",
            flagging_mode="never"
        )

    # --- Voice Style Transfer (Dummy) ---
    def apply_style_transfer(audio_path, mood="Happy"):
        # Placeholder: returns the input unchanged until a real style-transfer
        # model is wired in.
        return audio_path

    with gr.Tab("🧠 Voice Style Transfer"):
        gr.Interface(
            fn=apply_style_transfer,
            inputs=[
                gr.Audio(label="Upload Voice Clip", type="filepath"),
                gr.Radio(["Happy", "Sad", "Angry", "Calm"], label="Choose Tone")
            ],
            outputs=gr.Audio(label="Stylized Output", type="filepath"),
            title="Change Emotional Tone of Voice",
            description="Shift the emotional style of any voice clip.",
            flagging_mode="never"
        )

    # --- Voice Cloning (Fallback) ---
    with gr.Tab("🎭 Voice Cloning (Demo)"):
        gr.Interface(
            fn=clone_voice,
            inputs=[
                gr.File(label="Source Voice Clip"),
                gr.File(label="Target Voice Clip"),
                gr.Textbox(label="Text to Clone", lines=5)
            ],
            outputs=gr.Audio(label="Cloned Output", type="filepath"),
            title="Replace One Voice With Another (Demo)",
            description="Clone voice from source to target speaker using AI.",
            flagging_mode="never"
        )

if __name__ == "__main__":
    demo.launch()