import streamlit as st
import numpy as np
import librosa
import librosa.display  # explicit import needed for specshow on older librosa versions
import soundfile as sf
import os
import tempfile
import torch
import base64
import io
import matplotlib.pyplot as plt
# Page configuration
st.set_page_config(
    page_title="Music Stem Splitter",
    page_icon="🎵",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Set maximum audio duration (in seconds) and file size (in MB)
MAX_AUDIO_DURATION = 300  # 5 minutes
MAX_FILE_SIZE_MB = 100
# Load pretrained separator model, cached so it is only loaded once per process
@st.cache_resource
def load_separator_model():
    try:
        # Import here to avoid loading until needed
        from demucs.pretrained import get_model
        model = get_model('htdemucs')
        model.eval()
        if torch.cuda.is_available():
            model.cuda()
        return model
    except ImportError:
        st.error("Required package 'demucs' not found. Please install it with 'pip install demucs'.")
        return None
# Function to check audio length
def check_audio_length(audio_path):
    try:
        duration = librosa.get_duration(path=audio_path)
        return duration
    except Exception as e:
        st.error(f"Could not determine audio length: {str(e)}")
        return MAX_AUDIO_DURATION + 1  # Return a value that will fail the check
# Function to separate stems from an audio file
def separate_stems(audio_path, model, sample_rate=44100):
    from demucs.apply import apply_model
    import torchaudio

    # Load audio with potential resampling to save memory
    waveform, original_sample_rate = torchaudio.load(audio_path)

    # Resample if needed to optimize memory usage
    if original_sample_rate > sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=sample_rate)
        waveform = resampler(waveform)
        st.info(f"Audio resampled from {original_sample_rate}Hz to {sample_rate}Hz to optimize performance.")
    else:
        sample_rate = original_sample_rate
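    # Caveat: htdemucs is trained on 44.1 kHz audio, so the lower processing
    # rates used for longer files trade some separation quality for memory.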
    # Create a progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Prepare the model input
    if torch.cuda.is_available():
        waveform = waveform.cuda()

    # For Demucs, we need the audio as (batch, channels, time)
    if waveform.dim() == 2:  # (channels, time)
        waveform = waveform.unsqueeze(0)

    stems = {}
    # Process and separate stems
    status_text.text("Separating stems... This may take a while depending on the audio length.")

    # Optimize memory usage by processing in chunks if needed
    with torch.no_grad():
        # Use smaller chunks for CPU, larger for GPU
        chunk_size = 10 * sample_rate if torch.cuda.is_available() else 5 * sample_rate
        if waveform.shape[-1] > chunk_size and waveform.shape[-1] > 30 * sample_rate:
            # Process in chunks for very long audio
            st.info("Processing long audio in chunks to optimize memory usage...")
            sources = None

            # Calculate number of chunks
            num_chunks = int(np.ceil(waveform.shape[-1] / chunk_size))
            for i in range(num_chunks):
                # Update progress
                progress = i / num_chunks * 0.7  # 70% of progress for separation
                progress_bar.progress(progress)
                status_text.text(f"Processing chunk {i+1}/{num_chunks}...")

                # Extract chunk
                start = i * chunk_size
                end = min(start + chunk_size, waveform.shape[-1])
                chunk = waveform[:, :, start:end]

                # Process chunk
                chunk_sources = apply_model(model, chunk, device="cuda" if torch.cuda.is_available() else "cpu")

                # Concatenate along the time dimension
                if sources is None:
                    sources = chunk_sources
                else:
                    sources = torch.cat([sources, chunk_sources], dim=-1)

                # Clear GPU memory if needed
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
        else:
            # Process entire audio at once for shorter clips
            sources = apply_model(model, waveform, device="cuda" if torch.cuda.is_available() else "cpu")
    # sources is (batch, source, channels, time)
    sources = sources[0]  # Remove batch dimension

    # Save each source (order follows htdemucs' output)
    source_names = ["drums", "bass", "other", "vocals"]
    for i, source_name in enumerate(source_names):
        stems[source_name] = sources[i].cpu().numpy()

        # Update progress
        progress = 0.7 + (i + 1) / len(source_names) * 0.2  # 20% of progress for stem saving
        progress_bar.progress(progress)
        status_text.text(f"Processed {source_name} stem ({i+1}/{len(source_names)})")
    # Create visualizations (at reduced resolution to save memory)
    visualizations = {}
    for stem_name, audio_data in stems.items():
        # Create spectrogram visualization
        plt.figure(figsize=(10, 4))

        # Use a smaller portion of audio for visualization if it's too long
        max_samples = min(sample_rate * 30, audio_data.shape[1])  # 30 seconds max
        visualization_data = audio_data[0, :max_samples]

        # Create spectrogram with reduced resolution
        D = librosa.amplitude_to_db(np.abs(librosa.stft(visualization_data, n_fft=1024, hop_length=512)), ref=np.max)
        librosa.display.specshow(D, y_axis='log', x_axis='time', sr=sample_rate)
        plt.title(f'{stem_name.capitalize()} Spectrogram')
        plt.colorbar(format='%+2.0f dB')
        plt.tight_layout()

        # Save figure to bytes
        buf = io.BytesIO()
        plt.savefig(buf, format='png', dpi=100)  # Lower DPI to save memory
        buf.seek(0)
        visualizations[stem_name] = buf
        plt.close()

    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Update progress to complete
    progress_bar.progress(1.0)
    status_text.text("Stem separation complete!")

    return stems, sample_rate, visualizations
# Function to create a download link for audio files
def get_binary_file_downloader_html(bin_data, file_label, file_extension):
    b64data = base64.b64encode(bin_data).decode()
    href = f'<a href="data:audio/{file_extension};base64,{b64data}" download="{file_label}.{file_extension}">Download {file_label}</a>'
    return href
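# Note: a base64 data URI embeds the whole file in the page, inflating it by
# roughly a third of the file size, so large stems can make the page sluggish.
# Streamlit's built-in st.download_button is a lighter alternative, e.g.:
#   st.download_button(f"Download {file_label}", bin_data,
#                      file_name=f"{file_label}.{file_extension}",
#                      mime=f"audio/{file_extension}")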
# Title and description
st.title("🎵 Music Stem Splitter")
st.markdown("""
This application separates music tracks into individual stems:
- **Vocals**: Lead and background vocals
- **Drums**: Drum kit and percussion
- **Bass**: Bass guitar, synth bass, etc.
- **Other**: All other instruments and sounds

Upload an audio file (MP3, WAV, FLAC, or OGG) to get started.
""")

# Add warning about HF Spaces limitations
st.warning(f"""
⚠️ **Hugging Face Spaces Limitations**:
- Maximum file size: {MAX_FILE_SIZE_MB}MB
- Maximum audio duration: {MAX_AUDIO_DURATION} seconds ({MAX_AUDIO_DURATION//60} minutes)
- Processing may take several minutes depending on server load
""")
# Initialize session state for storing results
if 'stems' not in st.session_state:
    st.session_state.stems = None
if 'sample_rate' not in st.session_state:
    st.session_state.sample_rate = None
if 'visualizations' not in st.session_state:
    st.session_state.visualizations = None
# File uploader
st.subheader("Upload Audio File")
uploaded_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "flac", "ogg"])

# Model loading (only when needed)
model_load_state = st.empty()
# Process the uploaded file
if uploaded_file is not None:
    # Check file size
    file_size_mb = uploaded_file.size / 1e6
    if file_size_mb > MAX_FILE_SIZE_MB:
        st.error(f"File too large: {file_size_mb:.1f}MB. Maximum allowed size is {MAX_FILE_SIZE_MB}MB.")
    else:
        # Display file info
        file_details = {"Filename": uploaded_file.name, "FileSize": f"{file_size_mb:.2f} MB"}
        st.write(file_details)

        # Create a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_path = tmp_file.name

        # Check audio duration
        audio_duration = check_audio_length(tmp_path)
        if audio_duration > MAX_AUDIO_DURATION:
            st.error(f"Audio duration too long: {audio_duration:.1f} seconds. Maximum allowed duration is {MAX_AUDIO_DURATION} seconds ({MAX_AUDIO_DURATION//60} minutes).")
            # Clean up temporary file
            os.unlink(tmp_path)
        else:
            st.info(f"Audio duration: {audio_duration:.1f} seconds")

            # Load model (cached after the first run)
            with model_load_state:
                st.info("Loading AI model... This may take a moment the first time.")
                model = load_separator_model()

            if model is not None:
                # Process button
                if st.button("Split into Stems"):
                    try:
                        # Select processing sample rate based on file duration.
                        # Shorter files can use higher quality; longer files use
                        # lower rates to save memory.
                        if audio_duration < 60:  # Less than 1 minute
                            processing_sample_rate = 44100
                        elif audio_duration < 180:  # 1-3 minutes
                            processing_sample_rate = 32000
                        else:  # 3-5 minutes
                            processing_sample_rate = 22050

                        # Perform stem separation
                        st.session_state.stems, st.session_state.sample_rate, st.session_state.visualizations = separate_stems(
                            tmp_path, model, sample_rate=processing_sample_rate
                        )
                        st.success("Stem separation completed! Scroll down to preview and download individual stems.")
                    except Exception as e:
                        st.error(f"An error occurred during processing: {str(e)}")
                        st.info("Try with a shorter audio clip or a different file format.")
            else:
                st.warning("Required packages not available. To run locally, install with 'pip install demucs librosa soundfile'")

            # Clean up temporary file
            os.unlink(tmp_path)
# Display results if available
if st.session_state.stems is not None:
    st.header("Separated Stems")

    # Create tabs for each stem
    stem_tabs = st.tabs(["Vocals", "Drums", "Bass", "Other"])

    # Get stem names in display order
    stem_names = ["vocals", "drums", "bass", "other"]

    # Process each stem
    for stem_tab, stem_name in zip(stem_tabs, stem_names):
        with stem_tab:
            # Create columns for audio player and visualization
            col1, col2 = st.columns([1, 1])

            with col1:
                st.subheader(f"{stem_name.capitalize()} Stem")

                # Convert numpy array to audio file for playback
                audio_data = st.session_state.stems[stem_name]

                # Create a temporary buffer for the audio data
                buf = io.BytesIO()
                sf.write(buf, audio_data.T, st.session_state.sample_rate, format='WAV')
                buf.seek(0)

                # Display audio player
                st.audio(buf, format='audio/wav')

                # Download button
                st.markdown(get_binary_file_downloader_html(buf.getvalue(), f"{stem_name}", "wav"), unsafe_allow_html=True)

                # Additional information
                if stem_name == "vocals":
                    st.info("Contains lead vocals and backing vocals.")
                elif stem_name == "drums":
                    st.info("Contains drums and percussion elements.")
                elif stem_name == "bass":
                    st.info("Contains bass guitar and low-frequency elements.")
                else:  # other
                    st.info("Contains all other instruments (guitars, keys, synths, etc).")

            with col2:
                # Display visualization
                if st.session_state.visualizations and stem_name in st.session_state.visualizations:
                    st.image(st.session_state.visualizations[stem_name], caption=f"{stem_name.capitalize()} Spectrogram")
# Show instructions for downloading all stems
st.header("Usage Tips")
st.markdown("""
### What can you do with these stems?
- Create remixes or mashups
- Practice playing along with isolated instrument tracks
- Create karaoke versions by removing vocals
- Analyze individual instrument parts for educational purposes

### Next steps:
1. Download each stem you want to use
2. Import them into your DAW (Digital Audio Workstation)
3. Mix, process, and create!
""")
# Add instructions for local deployment
st.sidebar.header("About This App")
st.sidebar.markdown("""
This application uses the Demucs model to separate audio tracks into individual stems. The model was developed by Facebook AI Research.

### How it works
The separation process uses a deep neural network to identify and isolate:
- Vocals
- Drums
- Bass
- Other instruments

### Source code
[GitHub Repository](https://github.com/huggingface/music-stem-splitter)
(Link to your repo once created)
""")
# Add a note about processing time
st.sidebar.markdown("""
### Processing Time
The processing time depends on:
- Length of the audio file
- Available computational resources
- File quality

For best results, use high-quality audio files without excessive background noise.
""")
# Show model information
st.sidebar.markdown("""
### Model Information
This app uses the HTDemucs model, which is trained to separate music into four stems.

Audio processing is optimized based on file length:
- Short files (< 1 min): 44.1kHz processing
- Medium files (1-3 min): 32kHz processing
- Longer files (3-5 min): 22.05kHz processing
""")