Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import numpy as np | |
import librosa | |
import librosa.display | |
import matplotlib.pyplot as plt | |
from PIL import Image | |
from io import BytesIO | |
import soundfile as sf | |
from pathlib import Path | |
from skimage.metrics import structural_similarity as ssim | |
def generate_waveform_image(audio_path, output_path=None, fig_size=(10, 4)):
    """Render the first two channels of an audio file as stacked waveform plots.

    Args:
        audio_path: Path to the audio file.
        output_path: Where to save the generated image. When falsy (``None``
            or empty) the image is rendered as PNG into memory instead.
        fig_size: Size of the figure as ``(width, height)``.

    Returns:
        ``output_path`` when it was provided, otherwise a ``BytesIO`` holding
        the PNG data, rewound to position 0.
    """
    # sr=None keeps the file's own sample rate; mono=False keeps the channels.
    samples, _ = librosa.load(audio_path, sr=None, mono=False)

    # Mono input is duplicated so the two-panel layout is always the same.
    if samples.ndim == 1:
        samples = np.array([samples, samples])

    fig = plt.figure(figsize=fig_size)
    try:
        # Only the first two channels are drawn, one panel each.
        for idx in range(2):
            ax = fig.add_subplot(2, 1, idx + 1)
            ax.plot(samples[idx])
            ax.set_title(f'Channel {idx + 1}')
            # Fixed Y-axis scale so images of different files are comparable.
            ax.set_ylim([-1.0, 1.0])

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        # Close the figure even when plotting or saving raised; pyplot keeps
        # every open figure registered until it is closed explicitly.
        plt.close(fig)
def generate_spectrogram_image(audio_path, output_path=None, fig_size=(10, 8)):
    """Render the first two channels of an audio file as stacked spectrograms.

    Args:
        audio_path: Path to the audio file.
        output_path: Where to save the generated image. When falsy (``None``
            or empty) the image is rendered as PNG into memory instead.
        fig_size: Size of the figure as ``(width, height)``.

    Returns:
        ``output_path`` when it was provided, otherwise a ``BytesIO`` holding
        the PNG data, rewound to position 0.
    """
    # sr=None keeps the file's own sample rate; mono=False keeps the channels.
    samples, sr = librosa.load(audio_path, sr=None, mono=False)

    # Mono input is duplicated so the two-panel layout is always the same.
    if samples.ndim == 1:
        samples = np.array([samples, samples])

    # Fixed color-scale limits (dB) so images of different files are comparable.
    vmin = -80
    vmax = 0

    fig = plt.figure(figsize=fig_size)
    try:
        # Only the first two channels are drawn, one panel each.
        for idx in range(2):
            # STFT magnitude in dB, referenced to this channel's own peak.
            magnitude = np.abs(librosa.stft(samples[idx]))
            spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)

            ax = fig.add_subplot(2, 1, idx + 1)
            mesh = librosa.display.specshow(
                spec_db,
                sr=sr,
                x_axis='time',
                y_axis='log',
                vmin=vmin,
                vmax=vmax,
                ax=ax,
            )
            fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
            ax.set_title(f'Channel {idx + 1} Spectrogram')
            # Frequency axis from 20 Hz up to the Nyquist frequency (sr / 2).
            ax.set_ylim([20, sr / 2])

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        # Close the figure even when plotting or saving raised; pyplot keeps
        # every open figure registered until it is closed explicitly.
        plt.close(fig)
def compare_images(image1_path, image2_path, min_similarity_threshold=0.999):
    """Compare two images using the Structural Similarity Index (SSIM).

    Both images are converted to RGB; if their sizes differ, the second image
    is resized to the size of the first. SSIM is computed separately for each
    of the three color channels and the results are averaged.

    Args:
        image1_path: Path to the first image.
        image2_path: Path to the second image.
        min_similarity_threshold: Minimum similarity required for the images
            to be considered matching.
            - Higher values (closer to 1.0) require more similar images.
            - Lower values are more permissive.
            - A value of 0.99 requires an averaged SSIM of at least 0.99.

    Returns:
        Tuple of (similarity_score, is_match)
        - similarity_score: Averaged SSIM as a built-in ``float``; 1.0 means
          identical images. SSIM can be negative for strongly dissimilar
          images.
        - is_match: Built-in ``bool``, True when
          similarity_score >= min_similarity_threshold.
    """
    # Context managers release the file handles deterministically; convert()
    # returns an independent image that stays valid after the file is closed.
    with Image.open(image1_path) as first:
        img1 = first.convert('RGB')
    with Image.open(image2_path) as second:
        img2 = second.convert('RGB')

    # SSIM needs arrays of identical shape.
    if img1.size != img2.size:
        img2 = img2.resize(img1.size)

    arr1 = np.array(img1)
    arr2 = np.array(img2)

    # One SSIM value per RGB channel; data_range matches 8-bit pixel values.
    channel_scores = [
        ssim(arr1[:, :, channel], arr2[:, :, channel], data_range=255)
        for channel in range(3)
    ]

    # Plain float/bool instead of NumPy scalars, so the result matches the
    # documented types (e.g. for JSON serialization or identity checks).
    similarity_score = float(np.mean(channel_scores))
    is_match = bool(similarity_score >= min_similarity_threshold)
    return (similarity_score, is_match)
def generate_reference_images(input_path, output_dir=None, prefix=""):
    """Generate reference waveform and spectrogram images for an audio file.

    The images are named ``{prefix}{name}_waveform.png`` and
    ``{prefix}{name}_spectrogram.png``, where ``name`` is the input file name
    without its extension.

    Args:
        input_path: Path to the audio file.
        output_dir: Directory to save the generated images. Defaults to the
            directory part of ``input_path``; an empty directory means the
            current working directory.
        prefix: Prefix to add to the output image filenames.

    Returns:
        Tuple of (waveform_path, spectrogram_path)
    """
    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # dirname() of a bare filename is "", and os.makedirs("") raises
    # FileNotFoundError. An empty directory refers to the current working
    # directory, which already exists, so there is nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    name_without_ext = os.path.splitext(os.path.basename(input_path))[0]

    waveform_path = os.path.join(
        output_dir, f"{prefix}{name_without_ext}_waveform.png"
    )
    generate_waveform_image(input_path, waveform_path)

    spectrogram_path = os.path.join(
        output_dir, f"{prefix}{name_without_ext}_spectrogram.png"
    )
    generate_spectrogram_image(input_path, spectrogram_path)

    return (waveform_path, spectrogram_path)