File size: 6,185 Bytes
01f8b5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import soundfile as sf
from pathlib import Path
from skimage.metrics import structural_similarity as ssim


def generate_waveform_image(audio_path, output_path=None, fig_size=(10, 4)):
    """Render the first two channels of an audio file as stacked waveform plots.

    The audio is loaded at its native sample rate without down-mixing. A mono
    file is duplicated so that two panels are always drawn, and both panels
    use a fixed [-1.0, 1.0] amplitude axis so that images produced from
    different files can be compared directly.

    Args:
        audio_path: Path to the audio file
        output_path: Path to save the generated image (optional)
        fig_size: Size of the figure (width, height)

    Returns:
        output_path when it is given (the image is written there), otherwise
        a BytesIO object, rewound to the start, containing the PNG image.
    """
    # Only the samples are plotted; the sample rate is not needed here.
    samples, _ = librosa.load(audio_path, sr=None, mono=False)

    # If mono, convert to stereo-like format for consistent plotting
    if samples.ndim == 1:
        samples = np.array([samples, samples])

    # Keep an explicit handle on the figure so that it can always be closed,
    # even when plotting or saving raises; otherwise the figure would stay
    # registered with pyplot and leak memory across calls.
    fig = plt.figure(figsize=fig_size)
    try:
        for index in range(2):
            ax = fig.add_subplot(2, 1, index + 1)
            ax.plot(samples[index])
            ax.set_title(f'Channel {index + 1}')
            ax.set_ylim([-1.0, 1.0])  # Fixed Y-axis scale for all waveforms

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        plt.close(fig)


def generate_spectrogram_image(audio_path, output_path=None, fig_size=(10, 8)):
    """Render the first two channels of an audio file as stacked spectrograms.

    The audio is loaded at its native sample rate without down-mixing. A mono
    file is duplicated so that two panels are always drawn. Each panel shows
    the STFT magnitude in dB relative to that channel's own peak, drawn with
    a fixed -80..0 dB color scale and a log-frequency axis from 20 Hz to the
    Nyquist frequency, so that images from different files are comparable.

    Args:
        audio_path: Path to the audio file
        output_path: Path to save the generated image (optional)
        fig_size: Size of the figure (width, height)

    Returns:
        output_path when it is given (the image is written there), otherwise
        a BytesIO object, rewound to the start, containing the PNG image.
    """
    y, sr = librosa.load(audio_path, sr=None, mono=False)

    # If mono, convert to stereo-like format for consistent plotting
    if y.ndim == 1:
        y = np.array([y, y])

    # Set fixed min and max values for spectrogram color scale
    vmin = -80  # dB
    vmax = 0    # dB

    # Keep an explicit handle on the figure so that it can always be closed,
    # even when the STFT, plotting or saving raises; otherwise the figure
    # would stay registered with pyplot and leak memory across calls.
    fig = plt.figure(figsize=fig_size)
    try:
        for index in range(2):
            # Magnitude spectrogram in dB, referenced to the channel's peak
            magnitude = np.abs(librosa.stft(y[index]))
            spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)

            ax = fig.add_subplot(2, 1, index + 1)
            # Use fixed frequency range and consistent color scaling
            mesh = librosa.display.specshow(
                spec_db,
                sr=sr,
                x_axis='time',
                y_axis='log',
                vmin=vmin,
                vmax=vmax,
                ax=ax,
            )
            fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
            ax.set_title(f'Channel {index + 1} Spectrogram')

            # Set frequency range (y-axis): from 20 Hz to Nyquist (sr/2)
            ax.set_ylim([20, sr / 2])

        fig.tight_layout()

        if output_path:
            fig.savefig(output_path)
            return output_path

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return buf
    finally:
        plt.close(fig)


def compare_images(image1_path, image2_path, min_similarity_threshold=0.999):
    """Compare two images using the Structural Similarity Index (SSIM).

    Both images are converted to RGB. If their sizes differ, the second image
    is resized to the size of the first. SSIM is computed independently for
    the R, G and B channels and the three scores are averaged.

    Args:
        image1_path: Path to the first image
        image2_path: Path to the second image
        min_similarity_threshold: Minimum similarity required for images to be
            considered matching
            - Higher values (closer to 1.0) require images to be more similar
            - Lower values (closer to 0.0) are more permissive
            - A value of 0.99 requires a mean SSIM of at least 0.99

    Returns:
        Tuple of (similarity_score, is_match)
        - similarity_score: Mean SSIM over the three color channels; 1.0 means
          identical images. SSIM is normally between 0.0 and 1.0 but can be
          slightly negative for strongly dissimilar images.
        - is_match: Python bool, True when
          similarity_score >= min_similarity_threshold
    """
    # Open the files in a context manager so the underlying file handles are
    # released deterministically; convert() returns independent in-memory
    # copies that remain valid after the files are closed.
    with Image.open(image1_path) as first_file, \
            Image.open(image2_path) as second_file:
        img1 = first_file.convert('RGB')
        img2 = second_file.convert('RGB')

    # Ensure same size for comparison
    if img1.size != img2.size:
        img2 = img2.resize(img1.size)

    arr1 = np.array(img1)
    arr2 = np.array(img2)

    # One SSIM score per RGB channel (8-bit data, hence data_range=255)
    channel_scores = [
        ssim(arr1[:, :, channel], arr2[:, :, channel], data_range=255)
        for channel in range(3)
    ]
    similarity_score = np.mean(channel_scores)

    # Return a native bool rather than numpy.bool_ so that identity checks
    # such as `is_match is True` behave as the documented Boolean would.
    is_match = bool(similarity_score >= min_similarity_threshold)

    return (similarity_score, is_match)


def generate_reference_images(input_path, output_dir=None, prefix=""):
    """Generate reference waveform and spectrogram images for an audio file.

    The images are named ``{prefix}{stem}_waveform.png`` and
    ``{prefix}{stem}_spectrogram.png``, where ``stem`` is the input file name
    without its extension.

    Args:
        input_path: Path to the audio file
        output_dir: Directory to save the generated images (optional).
            Defaults to the directory containing input_path.
        prefix: Prefix to add to the output image filenames

    Returns:
        Tuple of (waveform_path, spectrogram_path)
    """
    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # Create output directory if it doesn't exist. An empty directory name
    # (e.g. input_path is a bare file name) means the current working
    # directory, which already exists; os.makedirs('') would raise
    # FileNotFoundError, so skip it in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    name_without_ext = os.path.splitext(os.path.basename(input_path))[0]
    base_name = f"{prefix}{name_without_ext}"

    # Generate waveform image
    waveform_path = os.path.join(output_dir, f"{base_name}_waveform.png")
    generate_waveform_image(input_path, waveform_path)

    # Generate spectrogram image
    spectrogram_path = os.path.join(output_dir, f"{base_name}_spectrogram.png")
    generate_spectrogram_image(input_path, spectrogram_path)

    return (waveform_path, spectrogram_path)