# Spaces: Running on Zero
# File size: 6,185 Bytes
# 01f8b5b
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import soundfile as sf
from pathlib import Path
from skimage.metrics import structural_similarity as ssim
def generate_waveform_image(audio_path, output_path=None, fig_size=(10, 4)):
    """Generate a two-panel waveform image from an audio file.

    The audio is loaded at its native sample rate without downmixing. Mono
    input is duplicated so two panels are always drawn; when the file has
    more than two channels only the first two are plotted.

    Args:
        audio_path: Path to the audio file.
        output_path: Path to save the generated image (optional).
        fig_size: Size of the figure (width, height).

    Returns:
        ``output_path`` when it is given (the image is written there),
        otherwise a ``BytesIO`` rewound to offset 0 that holds the PNG data.
    """
    # sr=None keeps the file's own sample rate; mono=False keeps the channels.
    y, sr = librosa.load(audio_path, sr=None, mono=False)

    # If mono, convert to stereo-like format for consistent plotting.
    if y.ndim == 1:
        y = np.array([y, y])

    fig = plt.figure(figsize=fig_size)
    try:
        for index in range(2):
            ax = fig.add_subplot(2, 1, index + 1)
            ax.plot(y[index])
            ax.set_title(f'Channel {index + 1}')
            # Fixed Y-axis scale so images of different files are comparable.
            ax.set_ylim([-1.0, 1.0])

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        # Close on every path: pyplot keeps figures alive in a global
        # registry, so a failed plot/save would otherwise leak the figure.
        plt.close(fig)
def generate_spectrogram_image(audio_path, output_path=None, fig_size=(10, 8)):
    """Generate a two-panel spectrogram image from an audio file.

    The audio is loaded at its native sample rate without downmixing. Mono
    input is duplicated so two panels are always drawn; when the file has
    more than two channels only the first two are plotted. Each channel's
    STFT magnitude is converted to dB relative to that channel's own maximum
    (``ref=np.max``) and drawn on a log-frequency axis with a fixed colour
    range of -80 dB to 0 dB.

    Args:
        audio_path: Path to the audio file.
        output_path: Path to save the generated image (optional).
        fig_size: Size of the figure (width, height).

    Returns:
        ``output_path`` when it is given (the image is written there),
        otherwise a ``BytesIO`` rewound to offset 0 that holds the PNG data.
    """
    # sr=None keeps the file's own sample rate; mono=False keeps the channels.
    y, sr = librosa.load(audio_path, sr=None, mono=False)

    # If mono, convert to stereo-like format for consistent plotting.
    if y.ndim == 1:
        y = np.array([y, y])

    # Fixed colour-scale limits (dB) so images of different files are comparable.
    vmin = -80
    vmax = 0

    fig = plt.figure(figsize=fig_size)
    try:
        for i in range(2):
            S = librosa.amplitude_to_db(np.abs(librosa.stft(y[i])), ref=np.max)

            ax = fig.add_subplot(2, 1, i + 1)
            img = librosa.display.specshow(
                S,
                sr=sr,
                x_axis='time',
                y_axis='log',
                vmin=vmin,
                vmax=vmax,
                ax=ax,
            )
            fig.colorbar(img, ax=ax, format='%+2.0f dB')
            ax.set_title(f'Channel {i+1} Spectrogram')
            # Frequency range: from 20 Hz up to the Nyquist frequency (sr/2).
            ax.set_ylim([20, sr/2])

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        # Close on every path: pyplot keeps figures alive in a global
        # registry, so a failure in the STFT, plotting or saving would
        # otherwise leak the figure.
        plt.close(fig)
def compare_images(image1_path, image2_path, min_similarity_threshold=0.999):
    """Compare two images using the Structural Similarity Index (SSIM).

    Both images are converted to RGB. If their sizes differ, the second image
    is resized to the size of the first. SSIM is computed separately for the
    R, G and B channels and the three scores are averaged.

    Args:
        image1_path: Path to the first image.
        image2_path: Path to the second image.
        min_similarity_threshold: Minimum similarity required for the images
            to be considered matching.
            - Higher values (closer to 1.0) require images to be more similar.
            - Lower values (closer to 0.0) are more permissive.
            - A value of 0.99 requires 99% similarity between images.

    Returns:
        Tuple of (similarity_score, is_match)
            - similarity_score: Python float; 1.0 means identical images.
              NOTE(review): SSIM is usually within 0.0-1.0 but can be
              negative for strongly dissimilar images - confirm against the
              scikit-image documentation if callers rely on the lower bound.
            - is_match: Python bool, True when
              similarity_score >= min_similarity_threshold.
    """
    # Context managers release the underlying file handles deterministically;
    # convert() returns independent in-memory copies that outlive the files.
    with Image.open(image1_path) as first, Image.open(image2_path) as second:
        img1 = first.convert('RGB')
        img2 = second.convert('RGB')

    # Ensure same size for comparison.
    if img1.size != img2.size:
        img2 = img2.resize(img1.size)

    arr1 = np.array(img1)
    arr2 = np.array(img2)

    # One SSIM score per RGB channel; data_range=255 matches 8-bit pixels.
    channel_scores = [
        ssim(arr1[:, :, channel], arr2[:, :, channel], data_range=255)
        for channel in range(3)
    ]

    # Return built-in float/bool rather than NumPy scalars so the result can
    # be JSON-serialised and compared with `is True` / `is False`.
    similarity_score = float(np.mean(channel_scores))
    is_match = bool(similarity_score >= min_similarity_threshold)
    return (similarity_score, is_match)
def generate_reference_images(input_path, output_dir=None, prefix=""):
    """Generate reference waveform and spectrogram images for an audio file.

    The images are named ``{prefix}{name}_waveform.png`` and
    ``{prefix}{name}_spectrogram.png``, where ``name`` is the input file's
    base name without its extension.

    Args:
        input_path: Path to the audio file.
        output_dir: Directory to save the generated images (optional).
            Defaults to the directory part of ``input_path``.
        prefix: Prefix to add to the output image filenames.

    Returns:
        Tuple of (waveform_path, spectrogram_path).
    """
    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # dirname() of a bare filename is "", and os.makedirs("") raises
    # FileNotFoundError. An empty directory already means "current directory"
    # for os.path.join, so there is nothing to create in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    name_without_ext = os.path.splitext(os.path.basename(input_path))[0]

    # Generate waveform image.
    waveform_path = os.path.join(output_dir, f"{prefix}{name_without_ext}_waveform.png")
    generate_waveform_image(input_path, waveform_path)

    # Generate spectrogram image.
    spectrogram_path = os.path.join(output_dir, f"{prefix}{name_without_ext}_spectrogram.png")
    generate_spectrogram_image(input_path, spectrogram_path)

    return (waveform_path, spectrogram_path)