File size: 4,021 Bytes
421323e
 
 
 
 
e599c74
421323e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e599c74
 
 
 
 
 
421323e
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from typing import Optional
from config import config
import numpy as np
import librosa
from PIL import Image 
import soundfile as sf

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='librosa')

class Mel:
    """Bridge between an audio file, a mel spectrogram array and a greyscale image.

    Exactly one input source is used, picked in this order of priority:
    ``file_path`` (audio is loaded and converted to a spectrogram and image),
    then ``image`` (converted to a spectrogram array), then ``spectrogram``
    (converted to an image). If none is given, a message is printed and both
    ``self.spectrogram`` and ``self.image`` stay ``None``.

    The spectrogram is stored as ``uint8``: dB values in ``[-top_db, 0]`` are
    mapped linearly onto ``[0, 255]``.

    Args:
        file_path: Path of the audio file to load (only paths containing
            ".wav" are accepted).
        spectrogram: Pre-computed spectrogram array; cast to ``uint8``.
        image: PIL image whose pixels are the spectrogram values.
        x_res: Maximum number of time frames (image width) kept.
        y_res: Number of mel bins (image height).
        sample_rate: Sample rate used for loading and for reconstruction.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length, in samples, passed to librosa.
        top_db: Dynamic range in dB used both for ``power_to_db`` and for the
            ``uint8`` scaling.
        n_iter: Passed as ``n_iter`` to ``librosa.feature.inverse.mel_to_audio``
            (number of Griffin-Lim iterations per the librosa documentation).

    Raises:
        ValueError: If an argument is given with the wrong type.
    """

    def __init__(
            self,
            file_path: Optional[str] = None,
            spectrogram: Optional[np.ndarray] = None,
            image: Optional[Image.Image] = None,
            x_res: int = config.image_size,
            y_res: int = config.image_size,
            sample_rate: int = config.sample_rate,
            n_fft: int = 2048,
            hop_length: int = 882,
            top_db: int = 80,
            n_iter: int = 32,
            ):
        self.hop_length = hop_length
        self.sr = sample_rate
        self.n_fft = n_fft
        self.top_db = top_db
        self.n_iter = n_iter
        self.x_res = x_res
        self.y_res = y_res
        # One mel bin per image row.
        self.n_mels = self.y_res
        self.slice_size = self.x_res * self.hop_length - 1
        self.file_path = file_path
        self.spectrogram = spectrogram
        self.image = image

        if file_path is not None and not isinstance(file_path, str):
            raise ValueError("file_path must be a string")
        if spectrogram is not None and not isinstance(spectrogram, np.ndarray):
            raise ValueError("spectrogram must be an ndarray")
        if image is not None and not isinstance(image, Image.Image):
            raise ValueError("image must be a PIL Image")

        # Each load_* method is named after what it *produces*, hence an image
        # input triggers load_spectrogram() and a spectrogram input load_image().
        if file_path is not None:
            self.load_file()
        elif image is not None:
            self.load_spectrogram()
        elif spectrogram is not None:
            self.load_image()
        else:
            print("No file path, image or spectrogram was provided!")

    def load_file(self):
        """Load ``self.file_path`` and fill ``self.spectrogram`` and ``self.image``.

        This is best-effort: any failure is reported with ``print`` and leaves
        ``self.spectrogram`` / ``self.image`` unchanged instead of raising.
        """
        min_samples = self.x_res * self.hop_length
        try:
            # Reject unsupported paths explicitly; previously `audio` was left
            # unbound and the reported error was an unrelated UnboundLocalError.
            if ".wav" not in self.file_path:
                raise ValueError("unsupported audio format, expected a .wav file")
            audio, _ = librosa.load(self.file_path, mono=True, sr=self.sr)

            # Zero-pad short clips up to x_res * hop_length samples.
            if len(audio) < min_samples:
                audio = np.concatenate([audio, np.zeros((min_samples - len(audio),))])

            S = librosa.feature.melspectrogram(
                y=audio,
                sr=self.sr,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                n_mels=self.n_mels,
                fmax=self.sr // 2,
            )
            # Relative to the loudest bin, floored at -top_db.
            log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
            # Crop to at most y_res rows and x_res columns.
            log_S = log_S[:self.y_res, :self.x_res]
            # Shift/scale dB onto 0..255; the +0.5 makes the uint8 cast round
            # to nearest instead of truncating.
            scaled = ((log_S + self.top_db) * 255 / self.top_db).clip(0, 255)
            self.spectrogram = (scaled + 0.5).astype(np.uint8)
            self.image = Image.fromarray(self.spectrogram)
        except Exception as e:
            print(f"Error loading {self.file_path}: {e}")

    def load_spectrogram(self):
        """Set ``self.spectrogram`` to the pixel array of ``self.image``."""
        self.spectrogram = np.array(self.image)

    def load_image(self):
        """Cast ``self.spectrogram`` to ``uint8`` and build ``self.image`` from it."""
        self.spectrogram = self.spectrogram.astype("uint8")
        self.image = Image.fromarray(self.spectrogram)

    def get_spectrogram(self):
        """Return the stored spectrogram array (``None`` if nothing was loaded)."""
        return self.spectrogram

    def get_image(self):
        """Return the stored PIL image (``None`` if nothing was loaded)."""
        return self.image

    def _to_waveform(self) -> np.ndarray:
        """Invert the stored spectrogram back to raw audio samples.

        Raises:
            ValueError: If no spectrogram has been loaded.
        """
        if self.spectrogram is None:
            raise ValueError("no spectrogram loaded; cannot reconstruct audio")
        # Undo the uint8 scaling: 0..255 -> -top_db..0 dB.
        log_S = self.spectrogram.astype("float") * self.top_db / 255 - self.top_db
        S = librosa.db_to_power(log_S)
        return librosa.feature.inverse.mel_to_audio(
            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
        )

    def get_audio(self):
        """Reconstruct audio from the spectrogram and wrap it in ``Audio``.

        Raises:
            ValueError: If no spectrogram has been loaded.
        """
        # NOTE(review): `Audio` is not imported at the top of this file, so this
        # call raises NameError unless the name is injected elsewhere.
        # Presumably IPython.display.Audio is intended - confirm and import it.
        return Audio(self._to_waveform(), rate=self.sr)

    def save_audio(self):
        """Reconstruct audio and write it to ``config.generated_track_path``.

        Raises:
            ValueError: If no spectrogram has been loaded.
        """
        # Write the raw samples directly: soundfile.write expects sample data
        # plus a sample rate, so this no longer depends on `Audio` being
        # available or on which attributes that wrapper exposes.
        sf.write(config.generated_track_path, self._to_waveform(), self.sr)
        print(f"Audio saved to {config.generated_track_path}")

    def plot_spectrogram(self):
        """Display the stored spectrogram as a colour-mapped image."""
        # NOTE(review): `plt` is not imported at the top of this file;
        # presumably matplotlib.pyplot is intended - confirm and import it.
        plt.figure(figsize=(10, 4))
        plt.imshow(self.spectrogram, aspect='auto', origin='lower', cmap='viridis')
        plt.colorbar(label='Magnitude')
        plt.title('Mel Spectrogram')
        plt.xlabel('Time (frames)')
        plt.ylabel('Frequency (Mel bins)')
        plt.tight_layout()
        plt.show()