Spaces:

ASesYusuf1
/

dgfsfxc-tgsacxs-otyhrhs

Running on Zero

File size: 6,185 Bytes

01f8b5b

import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import soundfile as sf
from pathlib import Path
from skimage.metrics import structural_similarity as ssim


def generate_waveform_image(audio_path, output_path=None, fig_size=(10, 4)):
    """Generate a two-panel waveform image from an audio file.

    The audio is loaded at its native sample rate without downmixing. A mono
    signal is duplicated so that two panels are always drawn; only the first
    two channels of the loaded signal are plotted.

    Args:
        audio_path: Path to the audio file
        output_path: Path to save the generated image (optional)
        fig_size: Size of the figure (width, height)

    Returns:
        ``output_path`` when it is given (the image is written there);
        otherwise a BytesIO object, rewound to the start, containing the
        image encoded as PNG.
    """
    # Load audio file (the sample rate is not needed for a sample-index plot)
    y, _ = librosa.load(audio_path, sr=None, mono=False)

    # If mono, convert to stereo-like format for consistent plotting
    if y.ndim == 1:
        y = np.array([y, y])

    # Keep an explicit handle on the figure so exactly this figure is closed,
    # even if plotting or saving raises.
    fig, axes = plt.subplots(2, 1, figsize=fig_size)
    try:
        for index, ax in enumerate(axes):
            ax.plot(y[index])
            ax.set_title(f'Channel {index + 1}')
            # Fixed Y-axis scale so images of different files are comparable
            ax.set_ylim(-1.0, 1.0)

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        plt.close(fig)


def generate_spectrogram_image(audio_path, output_path=None, fig_size=(10, 8)):
    """Generate a two-panel spectrogram image from an audio file.

    The audio is loaded at its native sample rate without downmixing. A mono
    signal is duplicated so that two panels are always drawn; only the first
    two channels of the loaded signal are plotted. Each channel's STFT
    magnitude is converted to dB relative to that channel's own maximum and
    drawn on a log-frequency axis with a fixed colour range.

    Args:
        audio_path: Path to the audio file
        output_path: Path to save the generated image (optional)
        fig_size: Size of the figure (width, height)

    Returns:
        ``output_path`` when it is given (the image is written there);
        otherwise a BytesIO object, rewound to the start, containing the
        image encoded as PNG.
    """
    # Load audio file
    y, sr = librosa.load(audio_path, sr=None, mono=False)

    # If mono, convert to stereo-like format for consistent plotting
    if y.ndim == 1:
        y = np.array([y, y])

    # Fixed min and max values (dB) for the spectrogram color scale, so that
    # images of different files share the same color mapping
    vmin = -80
    vmax = 0

    # Keep an explicit handle on the figure so exactly this figure is closed,
    # even if the STFT, plotting or saving raises.
    fig, axes = plt.subplots(2, 1, figsize=fig_size)
    try:
        for index, ax in enumerate(axes):
            # Magnitude spectrogram in dB relative to the channel's peak
            magnitude = np.abs(librosa.stft(y[index]))
            S = librosa.amplitude_to_db(magnitude, ref=np.max)

            # Draw on this panel explicitly instead of relying on pyplot's
            # implicit "current axes"
            mesh = librosa.display.specshow(
                S,
                sr=sr,
                x_axis='time',
                y_axis='log',
                vmin=vmin,
                vmax=vmax,
                ax=ax,
            )
            fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
            ax.set_title(f'Channel {index + 1} Spectrogram')

            # Frequency range (y-axis): from 20Hz up to the Nyquist frequency
            ax.set_ylim(20, sr / 2)

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        plt.close(fig)


def compare_images(image1_path, image2_path, min_similarity_threshold=0.999):
    """Compare two images using the Structural Similarity Index (SSIM).

    Both images are converted to RGB; if their sizes differ, the second image
    is resized to the size of the first. SSIM is computed separately for each
    of the three color channels and the scores are averaged.

    Args:
        image1_path: Path to the first image
        image2_path: Path to the second image
        min_similarity_threshold: Minimum similarity required for images to be
            considered matching
            - Higher values (closer to 1.0) require images to be more similar
            - Lower values are more permissive
            - A value of 0.99 requires a mean SSIM of at least 0.99

    Returns:
        Tuple of (similarity_score, is_match)
        - similarity_score: Mean SSIM over the R, G and B channels (a NumPy
          float); 1.0 means identical images
        - is_match: Plain Python bool, True when
          similarity_score >= min_similarity_threshold
    """
    # Open images; the context managers release the underlying files as soon
    # as the pixel data has been copied out by convert().
    with Image.open(image1_path) as first_source:
        img1 = first_source.convert('RGB')
    with Image.open(image2_path) as second_source:
        img2 = second_source.convert('RGB')

    # Ensure same size for comparison
    if img1.size != img2.size:
        img2 = img2.resize(img1.size)

    # Convert to numpy arrays
    arr1 = np.array(img1)
    arr2 = np.array(img2)

    # Calculate SSIM for each of the three RGB channels
    channel_scores = [
        ssim(arr1[:, :, channel], arr2[:, :, channel], data_range=255)
        for channel in range(3)
    ]

    # Calculate average SSIM across channels
    similarity_score = np.mean(channel_scores)

    # Cast to a native bool: the raw comparison yields numpy.bool_, which
    # fails identity checks such as `is True` and is not JSON serializable.
    is_match = bool(similarity_score >= min_similarity_threshold)

    return (similarity_score, is_match)


def generate_reference_images(input_path, output_dir=None, prefix=""):
    """Generate reference waveform and spectrogram images for an audio file.

    The images are named ``{prefix}{stem}_waveform.png`` and
    ``{prefix}{stem}_spectrogram.png``, where ``stem`` is the input filename
    without its extension.

    Args:
        input_path: Path to the audio file
        output_dir: Directory to save the generated images (optional).
            Defaults to the directory containing ``input_path``; it is
            created if it does not exist.
        prefix: Prefix to add to the output image filenames

    Returns:
        Tuple of (waveform_path, spectrogram_path)
    """
    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # An empty directory string means the current working directory (e.g.
    # input_path is a bare filename). os.makedirs('') raises
    # FileNotFoundError, so only create the directory when one is named.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    stem = os.path.splitext(os.path.basename(input_path))[0]

    # Generate waveform image
    waveform_path = os.path.join(output_dir, f"{prefix}{stem}_waveform.png")
    generate_waveform_image(input_path, waveform_path)

    # Generate spectrogram image
    spectrogram_path = os.path.join(output_dir, f"{prefix}{stem}_spectrogram.png")
    generate_spectrogram_image(input_path, spectrogram_path)

    return (waveform_path, spectrogram_path)