Spaces:
Runtime error
Runtime error
| # Copy from https://github.com/happylittlecat2333/Auffusion/blob/main/converter.py | |
| import numpy as np | |
| from PIL import Image | |
| import math | |
| import os | |
| import random | |
| import torch | |
| import json | |
| import torch.utils.data | |
| import numpy as np | |
| import librosa | |
| # from librosa.util import normalize | |
| from scipy.io.wavfile import read | |
| from librosa.filters import mel as librosa_mel_fn | |
| import torch.nn.functional as F | |
| import torch.nn as nn | |
| from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d | |
| from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm | |
| MAX_WAV_VALUE = 32768.0 | |
| def load_wav(full_path): | |
| sampling_rate, data = read(full_path) | |
| return data, sampling_rate | |
| def dynamic_range_compression(x, C=1, clip_val=1e-5): | |
| return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) | |
| def dynamic_range_decompression(x, C=1): | |
| return np.exp(x) / C | |
| def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): | |
| return torch.log(torch.clamp(x, min=clip_val) * C) | |
| def dynamic_range_decompression_torch(x, C=1): | |
| return torch.exp(x) / C | |
| def spectral_normalize_torch(magnitudes): | |
| output = dynamic_range_compression_torch(magnitudes) | |
| return output | |
| def spectral_de_normalize_torch(magnitudes): | |
| output = dynamic_range_decompression_torch(magnitudes) | |
| return output | |
| mel_basis = {} | |
| hann_window = {} | |
| def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): | |
| if torch.min(y) < -1.: | |
| print('min value is ', torch.min(y)) | |
| if torch.max(y) > 1.: | |
| print('max value is ', torch.max(y)) | |
| global mel_basis, hann_window | |
| if fmax not in mel_basis: | |
| mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) | |
| mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) | |
| hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) | |
| y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') | |
| y = y.squeeze(1) | |
| # complex tensor as default, then use view_as_real for future pytorch compatibility | |
| spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], | |
| center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) | |
| spec = torch.view_as_real(spec) | |
| spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) | |
| spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) | |
| spec = spectral_normalize_torch(spec) | |
| return spec | |
| def spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): | |
| if torch.min(y) < -1.: | |
| print('min value is ', torch.min(y)) | |
| if torch.max(y) > 1.: | |
| print('max value is ', torch.max(y)) | |
| global hann_window | |
| hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) | |
| y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') | |
| y = y.squeeze(1) | |
| # complex tensor as default, then use view_as_real for future pytorch compatibility | |
| spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], | |
| center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) | |
| spec = torch.view_as_real(spec) | |
| spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) | |
| return spec | |
| def normalize_spectrogram( | |
| spectrogram: torch.Tensor, | |
| max_value: float = 200, | |
| min_value: float = 1e-5, | |
| power: float = 1., | |
| inverse: bool = False | |
| ) -> torch.Tensor: | |
| # Rescale to 0-1 | |
| max_value = np.log(max_value) # 5.298317366548036 | |
| min_value = np.log(min_value) # -11.512925464970229 | |
| assert spectrogram.max() <= max_value and spectrogram.min() >= min_value | |
| data = (spectrogram - min_value) / (max_value - min_value) | |
| # Invert | |
| if inverse: | |
| data = 1 - data | |
| # Apply the power curve | |
| data = torch.pow(data, power) | |
| # 1D -> 3D | |
| data = data.repeat(3, 1, 1) | |
| # Flip Y axis: image origin at the top-left corner, spectrogram origin at the bottom-left corner | |
| data = torch.flip(data, [1]) | |
| return data | |
| def denormalize_spectrogram( | |
| data: torch.Tensor, | |
| max_value: float = 200, | |
| min_value: float = 1e-5, | |
| power: float = 1, | |
| inverse: bool = False, | |
| ) -> torch.Tensor: | |
| max_value = np.log(max_value) | |
| min_value = np.log(min_value) | |
| # Flip Y axis: image origin at the top-left corner, spectrogram origin at the bottom-left corner | |
| data = torch.flip(data, [1]) | |
| assert len(data.shape) == 3, "Expected 3 dimensions, got {}".format(len(data.shape)) | |
| if data.shape[0] == 1: | |
| data = data.repeat(3, 1, 1) | |
| assert data.shape[0] == 3, "Expected 3 channels, got {}".format(data.shape[0]) | |
| data = data[0] | |
| # Reverse the power curve | |
| data = torch.pow(data, 1 / power) | |
| # Invert | |
| if inverse: | |
| data = 1 - data | |
| # Rescale to max value | |
| spectrogram = data * (max_value - min_value) + min_value | |
| return spectrogram | |
| def get_mel_spectrogram_from_audio(audio, device="cpu"): | |
| audio = audio / MAX_WAV_VALUE | |
| audio = librosa.util.normalize(audio) * 0.95 | |
| # print(' >>> normalize done <<< ') | |
| audio = torch.FloatTensor(audio) | |
| audio = audio.unsqueeze(0) | |
| waveform = audio.to(device) | |
| spec = mel_spectrogram(waveform, n_fft=2048, num_mels=256, sampling_rate=16000, hop_size=160, win_size=1024, fmin=0, fmax=8000, center=False) | |
| return audio, spec | |
| LRELU_SLOPE = 0.1 | |
| MAX_WAV_VALUE = 32768.0 | |
| class AttrDict(dict): | |
| def __init__(self, *args, **kwargs): | |
| super(AttrDict, self).__init__(*args, **kwargs) | |
| self.__dict__ = self | |
| def get_config(config_path): | |
| config = json.loads(open(config_path).read()) | |
| config = AttrDict(config) | |
| return config | |
| def init_weights(m, mean=0.0, std=0.01): | |
| classname = m.__class__.__name__ | |
| if classname.find("Conv") != -1: | |
| m.weight.data.normal_(mean, std) | |
| def apply_weight_norm(m): | |
| classname = m.__class__.__name__ | |
| if classname.find("Conv") != -1: | |
| weight_norm(m) | |
| def get_padding(kernel_size, dilation=1): | |
| return int((kernel_size*dilation - dilation)/2) | |
| class ResBlock1(torch.nn.Module): | |
| def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | |
| super(ResBlock1, self).__init__() | |
| self.h = h | |
| self.convs1 = nn.ModuleList([ | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], | |
| padding=get_padding(kernel_size, dilation[0]))), | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], | |
| padding=get_padding(kernel_size, dilation[1]))), | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], | |
| padding=get_padding(kernel_size, dilation[2]))) | |
| ]) | |
| self.convs1.apply(init_weights) | |
| self.convs2 = nn.ModuleList([ | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, | |
| padding=get_padding(kernel_size, 1))), | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, | |
| padding=get_padding(kernel_size, 1))), | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, | |
| padding=get_padding(kernel_size, 1))) | |
| ]) | |
| self.convs2.apply(init_weights) | |
| def forward(self, x): | |
| for c1, c2 in zip(self.convs1, self.convs2): | |
| xt = F.leaky_relu(x, LRELU_SLOPE) | |
| xt = c1(xt) | |
| xt = F.leaky_relu(xt, LRELU_SLOPE) | |
| xt = c2(xt) | |
| x = xt + x | |
| return x | |
| def remove_weight_norm(self): | |
| for l in self.convs1: | |
| remove_weight_norm(l) | |
| for l in self.convs2: | |
| remove_weight_norm(l) | |
| class ResBlock2(torch.nn.Module): | |
| def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): | |
| super(ResBlock2, self).__init__() | |
| self.h = h | |
| self.convs = nn.ModuleList([ | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], | |
| padding=get_padding(kernel_size, dilation[0]))), | |
| weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], | |
| padding=get_padding(kernel_size, dilation[1]))) | |
| ]) | |
| self.convs.apply(init_weights) | |
| def forward(self, x): | |
| for c in self.convs: | |
| xt = F.leaky_relu(x, LRELU_SLOPE) | |
| xt = c(xt) | |
| x = xt + x | |
| return x | |
| def remove_weight_norm(self): | |
| for l in self.convs: | |
| remove_weight_norm(l) | |
| class Generator(torch.nn.Module): | |
| def __init__(self, h): | |
| super(Generator, self).__init__() | |
| self.h = h | |
| self.num_kernels = len(h.resblock_kernel_sizes) | |
| self.num_upsamples = len(h.upsample_rates) | |
| self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) # change: 80 --> 512 | |
| resblock = ResBlock1 if h.resblock == '1' else ResBlock2 | |
| self.ups = nn.ModuleList() | |
| for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): | |
| if (k-u) % 2 == 0: | |
| self.ups.append(weight_norm( | |
| ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), | |
| k, u, padding=(k-u)//2))) | |
| else: | |
| self.ups.append(weight_norm( | |
| ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), | |
| k, u, padding=(k-u)//2+1, output_padding=1))) | |
| # self.ups.append(weight_norm( | |
| # ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), | |
| # k, u, padding=(k-u)//2))) | |
| self.resblocks = nn.ModuleList() | |
| for i in range(len(self.ups)): | |
| ch = h.upsample_initial_channel//(2**(i+1)) | |
| for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | |
| self.resblocks.append(resblock(h, ch, k, d)) | |
| self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) | |
| self.ups.apply(init_weights) | |
| self.conv_post.apply(init_weights) | |
| def forward(self, x): | |
| x = self.conv_pre(x) | |
| for i in range(self.num_upsamples): | |
| x = F.leaky_relu(x, LRELU_SLOPE) | |
| x = self.ups[i](x) | |
| xs = None | |
| for j in range(self.num_kernels): | |
| if xs is None: | |
| xs = self.resblocks[i*self.num_kernels+j](x) | |
| else: | |
| xs += self.resblocks[i*self.num_kernels+j](x) | |
| x = xs / self.num_kernels | |
| x = F.leaky_relu(x) | |
| x = self.conv_post(x) | |
| x = torch.tanh(x) | |
| return x | |
| def remove_weight_norm(self): | |
| for l in self.ups: | |
| remove_weight_norm(l) | |
| for l in self.resblocks: | |
| l.remove_weight_norm() | |
| remove_weight_norm(self.conv_pre) | |
| remove_weight_norm(self.conv_post) | |
| def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None): | |
| if subfolder is not None: | |
| pretrained_model_name_or_path = os.path.join(pretrained_model_name_or_path, subfolder) | |
| config_path = os.path.join(pretrained_model_name_or_path, "config.json") | |
| ckpt_path = os.path.join(pretrained_model_name_or_path, "vocoder.pt") | |
| config = get_config(config_path) | |
| vocoder = cls(config) | |
| state_dict_g = torch.load(ckpt_path) | |
| vocoder.load_state_dict(state_dict_g["generator"]) | |
| vocoder.eval() | |
| vocoder.remove_weight_norm() | |
| return vocoder | |
| def inference(self, mels, lengths=None): | |
| self.eval() | |
| with torch.no_grad(): | |
| wavs = self(mels).squeeze(1) | |
| wavs = (wavs.cpu().numpy() * MAX_WAV_VALUE).astype("int16") | |
| if lengths is not None: | |
| wavs = wavs[:, :lengths] | |
| return wavs | |
| def normalize(images): | |
| """ | |
| Normalize an image array to [-1,1]. | |
| """ | |
| if images.min() >= 0: | |
| return 2.0 * images - 1.0 | |
| else: | |
| return images | |
| def pad_spec(spec, spec_length, pad_value=0, random_crop=True): # spec: [3, mel_dim, spec_len] | |
| assert spec_length % 8 == 0, "spec_length must be divisible by 8" | |
| if spec.shape[-1] < spec_length: | |
| # pad spec to spec_length | |
| spec = F.pad(spec, (0, spec_length - spec.shape[-1]), value=pad_value) | |
| else: | |
| # random crop | |
| if random_crop: | |
| start = random.randint(0, spec.shape[-1] - spec_length) | |
| spec = spec[:, :, start:start+spec_length] | |
| else: | |
| spec = spec[:, :, :spec_length] | |
| return spec |