"""Render waveform/spectrogram images for audio files and compare images via SSIM."""

import os
from io import BytesIO
from pathlib import Path

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from PIL import Image
from skimage.metrics import structural_similarity as ssim


def _load_two_channels(audio_path):
    """Load ``audio_path`` at its native sample rate as a multi-row array.

    Mono input is duplicated into two identical rows so callers can always
    index ``y[0]`` and ``y[1]``.

    Returns:
        Tuple of (samples, sample_rate).
    """
    y, sr = librosa.load(audio_path, sr=None, mono=False)
    if y.ndim == 1:
        y = np.array([y, y])
    return y, sr


def _save_figure(fig, output_path):
    """Write ``fig`` to ``output_path``, or to an in-memory PNG if it is falsy.

    Returns:
        ``output_path`` when it is truthy, otherwise a ``BytesIO`` rewound to
        the start of the PNG data.
    """
    if output_path:
        fig.savefig(output_path)
        return output_path

    buf = BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    return buf


def generate_waveform_image(audio_path, output_path=None, fig_size=(10, 4)):
    """Generate a waveform image from an audio file.

    Only the first two channels are plotted; mono audio is shown twice.

    Args:
        audio_path: Path to the audio file
        output_path: Path to save the generated image (optional)
        fig_size: Size of the figure (width, height)

    Returns:
        BytesIO object containing the PNG image if output_path is not given,
        otherwise output_path (after the image has been saved there)
    """
    y, _sr = _load_two_channels(audio_path)

    fig = plt.figure(figsize=fig_size)
    try:
        for idx in range(2):
            plt.subplot(2, 1, idx + 1)
            plt.plot(y[idx])
            plt.title(f'Channel {idx + 1}')
            # Fixed Y-axis scale so images of different files are comparable.
            plt.ylim([-1.0, 1.0])

        fig.tight_layout()
        return _save_figure(fig, output_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


def generate_spectrogram_image(audio_path, output_path=None, fig_size=(10, 8)):
    """Generate a spectrogram image from an audio file.

    Only the first two channels are plotted; mono audio is shown twice.

    Args:
        audio_path: Path to the audio file
        output_path: Path to save the generated image (optional)
        fig_size: Size of the figure (width, height)

    Returns:
        BytesIO object containing the PNG image if output_path is not given,
        otherwise output_path (after the image has been saved there)
    """
    y, sr = _load_two_channels(audio_path)

    # Fixed colour-scale limits (dB) so images of different files are comparable.
    vmin = -80
    vmax = 0

    fig = plt.figure(figsize=fig_size)
    try:
        for i in range(2):
            # Magnitude spectrogram in dB relative to this channel's peak.
            S = librosa.amplitude_to_db(np.abs(librosa.stft(y[i])), ref=np.max)

            plt.subplot(2, 1, i + 1)
            librosa.display.specshow(
                S,
                sr=sr,
                x_axis='time',
                y_axis='log',
                vmin=vmin,
                vmax=vmax,
            )
            plt.colorbar(format='%+2.0f dB')
            plt.title(f'Channel {i+1} Spectrogram')

            # Frequency axis from 20 Hz up to the Nyquist frequency (sr / 2).
            plt.ylim([20, sr / 2])

        fig.tight_layout()
        return _save_figure(fig, output_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


def compare_images(image1_path, image2_path, min_similarity_threshold=0.999):
    """Compare two images using the Structural Similarity Index (SSIM).

    Both images are converted to RGB; if their sizes differ, the second image
    is resized to the first image's size. SSIM is computed per colour channel
    and averaged.

    Args:
        image1_path: Path to the first image
        image2_path: Path to the second image
        min_similarity_threshold: Minimum similarity required for images to be
            considered matching
            - Higher values (closer to 1.0) require images to be more similar
            - Lower values are more permissive
            - A value of 0.99 requires an average SSIM of at least 0.99

    Returns:
        Tuple of (similarity_score, is_match)
        - similarity_score: float, where 1.0 means identical images. Usually
          between 0.0 and 1.0, but SSIM can be negative for strongly
          dissimilar images.
        - is_match: bool, True if similarity_score >= min_similarity_threshold
    """
    # Context managers close the underlying files; convert() returns
    # fully-loaded copies that remain valid after the files are closed.
    with Image.open(image1_path) as raw1, Image.open(image2_path) as raw2:
        img1 = raw1.convert('RGB')
        img2 = raw2.convert('RGB')

    # SSIM needs arrays of identical shape.
    if img1.size != img2.size:
        img2 = img2.resize(img1.size)

    arr1 = np.array(img1)
    arr2 = np.array(img2)

    channel_scores = [
        ssim(arr1[:, :, channel], arr2[:, :, channel], data_range=255)
        for channel in range(3)  # R, G, B
    ]

    # Return built-in types so the result is a real float/bool rather than
    # NumPy scalar types.
    similarity_score = float(np.mean(channel_scores))
    is_match = bool(similarity_score >= min_similarity_threshold)
    return (similarity_score, is_match)


def generate_reference_images(input_path, output_dir=None, prefix=""):
    """Generate reference waveform and spectrogram images for an audio file.

    Images are named ``{prefix}{stem}_waveform.png`` and
    ``{prefix}{stem}_spectrogram.png``, where ``stem`` is the input file name
    without its extension.

    Args:
        input_path: Path to the audio file
        output_dir: Directory to save the generated images (optional);
            defaults to the directory containing input_path
        prefix: Prefix to add to the output image filenames

    Returns:
        Tuple of (waveform_path, spectrogram_path)
    """
    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # An empty directory (bare file name such as "clip.wav") means the current
    # working directory; os.makedirs("") would raise FileNotFoundError.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    input_filename = os.path.basename(input_path)
    name_without_ext = os.path.splitext(input_filename)[0]

    waveform_path = os.path.join(output_dir, f"{prefix}{name_without_ext}_waveform.png")
    generate_waveform_image(input_path, waveform_path)

    spectrogram_path = os.path.join(output_dir, f"{prefix}{name_without_ext}_spectrogram.png")
    generate_spectrogram_image(input_path, spectrogram_path)

    return (waveform_path, spectrogram_path)