|
|
|
|
|
import numpy as np |
|
from PIL import Image |
|
|
|
import math |
|
import os |
|
import random |
|
import torch |
|
import json |
|
import torch.utils.data |
|
import numpy as np |
|
import librosa |
|
from librosa.util import normalize |
|
from scipy.io.wavfile import read |
|
from librosa.filters import mel as librosa_mel_fn |
|
|
|
import torch.nn.functional as F |
|
import torch.nn as nn |
|
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d |
|
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm |
|
|
|
# Full-scale amplitude of 16-bit PCM audio; used to convert between int16 and float.
MAX_WAV_VALUE = 32768.0
|
|
|
|
|
def load_wav(full_path):
    """Read a WAV file and return (samples, sampling_rate)."""
    sr, samples = read(full_path)
    return samples, sr
|
|
|
|
|
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes (NumPy): log(max(x, clip_val) * C)."""
    clipped = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(clipped * C)
|
|
|
|
|
def dynamic_range_decompression(x, C=1):
    """Invert dynamic_range_compression: exp(x) / C."""
    return np.divide(np.exp(x), C)
|
|
|
|
|
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes (torch): log(clamp(x, min=clip_val) * C)."""
    return torch.log(torch.clamp(x, min=clip_val).mul(C))
|
|
|
|
|
def dynamic_range_decompression_torch(x, C=1):
    """Invert dynamic_range_compression_torch: exp(x) / C."""
    return torch.exp(x).div(C)
|
|
|
|
|
def spectral_normalize_torch(magnitudes):
    """Apply log dynamic-range compression to spectrogram magnitudes."""
    return dynamic_range_compression_torch(magnitudes)
|
|
|
|
|
def spectral_de_normalize_torch(magnitudes):
    """Invert spectral_normalize_torch (exp of the log-compressed values)."""
    return dynamic_range_decompression_torch(magnitudes)
|
|
|
|
|
# Per-process caches: mel_basis is keyed by "{fmax}_{device}", hann_window by
# "{device}", so filterbanks/windows are not rebuilt for every call.
mel_basis = {}

hann_window = {}
|
|
|
|
|
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute the log-compressed mel spectrogram of a waveform batch.

    Args:
        y: (B, T) float tensor, expected in [-1, 1] (out-of-range values
           are only warned about, not clipped).
        n_fft/num_mels/sampling_rate/hop_size/win_size/fmin/fmax: STFT and
           mel filterbank parameters.
        center: passed through to torch.stft; this function does its own
           reflect padding so center=False keeps T/hop_size frames.

    Returns:
        (B, num_mels, frames) log-compressed mel spectrogram.

    NOTE: the filterbank/window caches are keyed only by (fmax, device), so
    calls that vary n_fft/num_mels/win_size while keeping fmax and device
    would reuse stale entries — this mirrors the upstream HiFi-GAN
    assumption of one fixed audio config per run.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    key = str(fmax) + '_' + str(y.device)
    # Bug fix: the original tested `fmax not in mel_basis`, which never
    # matches the composite keys stored below, so the filterbank was
    # rebuilt (and copied to the device) on every single call.
    if key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    # Manual reflect padding so that center=False still yields T/hop frames.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon for numerical stability of the sqrt.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
|
|
|
|
def spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute the linear magnitude spectrogram of a waveform batch (B, T).

    num_mels / sampling_rate / fmin / fmax are accepted for signature
    parity with mel_spectrogram but are unused here. The Hann window is
    (re)stored in the module-level cache keyed by device on every call.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    device_key = str(y.device)
    hann_window[device_key] = torch.hann_window(win_size).to(y.device)

    # Reflect-pad so that center=False still yields T/hop_size frames.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    spec = torch.stft(
        y, n_fft, hop_length=hop_size, win_length=win_size,
        window=hann_window[device_key], center=center, pad_mode='reflect',
        normalized=False, onesided=True, return_complex=True,
    )
    # Magnitude of the complex STFT, with a small epsilon under the sqrt.
    spec = torch.view_as_real(spec)
    return torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
|
|
|
|
|
def normalize_spectrogram(
    spectrogram: torch.Tensor,
    max_value: float = 200,
    min_value: float = 1e-5,
    power: float = 1.,
    inverse: bool = False
) -> torch.Tensor:
    """Map a log-magnitude spectrogram into a 3-channel [0, 1] image tensor.

    The input is min-max scaled against [log(min_value), log(max_value)]
    (asserted to contain it), optionally inverted, gamma-adjusted by
    `power`, replicated to 3 channels, and flipped along dim 1 so that it
    reads like an image (low frequencies at the bottom).
    """
    log_max = np.log(max_value)
    log_min = np.log(min_value)

    assert spectrogram.max() <= log_max and spectrogram.min() >= log_min

    # Min-max scale into [0, 1].
    scaled = (spectrogram - log_min) / (log_max - log_min)

    if inverse:
        scaled = 1 - scaled

    scaled = torch.pow(scaled, power)

    # Replicate to 3 channels and flip the frequency axis.
    image = scaled.repeat(3, 1, 1)
    return torch.flip(image, [1])
|
|
|
|
|
|
|
def denormalize_spectrogram(
    data: torch.Tensor,
    max_value: float = 200,
    min_value: float = 1e-5,
    power: float = 1,
    inverse: bool = False,
) -> torch.Tensor:
    """Invert normalize_spectrogram: image tensor -> log-magnitude spectrogram.

    Accepts a (3, F, T) tensor (or (1, F, T), which is replicated to 3
    channels), undoes the frequency flip, keeps channel 0, reverses the
    gamma/inversion, and rescales back to [log(min_value), log(max_value)].
    """
    log_max = np.log(max_value)
    log_min = np.log(min_value)

    # Undo the frequency-axis flip applied during normalization.
    data = torch.flip(data, [1])

    assert len(data.shape) == 3, "Expected 3 dimensions, got {}".format(len(data.shape))

    if data.shape[0] == 1:
        data = data.repeat(3, 1, 1)

    assert data.shape[0] == 3, "Expected 3 channels, got {}".format(data.shape[0])
    # The three channels are identical copies; keep the first.
    data = data[0]

    data = torch.pow(data, 1 / power)

    if inverse:
        data = 1 - data

    # Rescale from [0, 1] back to the log-magnitude range.
    return data * (log_max - log_min) + log_min
|
|
|
|
|
def get_mel_spectrogram_from_audio(audio, device="cuda"):
    """Normalize a raw int16-scale waveform and compute its mel spectrogram.

    Returns (audio_tensor, spec): audio_tensor is the normalized (1, T)
    float waveform on CPU; spec is the mel spectrogram computed on
    `device` with the fixed 16 kHz / 256-mel configuration below.
    """
    # Scale from int16 range to [-1, 1], then peak-normalize to 0.95.
    audio = librosa.util.normalize(audio / MAX_WAV_VALUE) * 0.95

    audio = torch.FloatTensor(audio).unsqueeze(0)

    spec = mel_spectrogram(
        audio.to(device),
        n_fft=2048, num_mels=256, sampling_rate=16000,
        hop_size=160, win_size=1024, fmin=0, fmax=8000, center=False,
    )
    return audio, spec
|
|
|
|
|
|
|
# Negative slope for the leaky-ReLU activations used throughout the generator.
LRELU_SLOPE = 0.1

# Full-scale int16 amplitude (re-declared; also defined near the top of this file).
MAX_WAV_VALUE = 32768.0
|
|
|
|
|
class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes.

    Pointing __dict__ at the dict itself makes `d.key` and `d['key']`
    aliases of the same storage, in both directions.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.__dict__ = self
|
|
|
|
|
def get_config(config_path):
    """Load a JSON config file and wrap it in an AttrDict.

    The original opened the file without ever closing it; the context
    manager closes the handle deterministically.
    """
    with open(config_path) as f:
        config = json.load(f)
    return AttrDict(config)
|
|
|
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialize conv-like modules' weights in place from N(mean, std).

    Intended for use with Module.apply(); non-Conv modules are untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
|
|
|
|
def apply_weight_norm(m):
    """Wrap conv-like modules with weight normalization, in place.

    Intended for use with Module.apply(); non-Conv modules are untouched.
    """
    if "Conv" in m.__class__.__name__:
        weight_norm(m)
|
|
|
|
|
def get_padding(kernel_size, dilation=1):
    """'Same' padding for an odd kernel with the given dilation."""
    return (kernel_size - 1) * dilation // 2
|
|
|
|
|
class ResBlock1(torch.nn.Module):
    """Residual block with two convolutions per branch (HiFi-GAN style).

    Each of the three branches applies leaky-ReLU -> dilated conv ->
    leaky-ReLU -> dilation-1 conv, then adds the branch input back.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        # Dilated convolutions, one per dilation[0..2] (construction order
        # matches the RNG consumption of the original implementation).
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d,
                               padding=get_padding(kernel_size, d)))
            for d in (dilation[0], dilation[1], dilation[2])
        ])
        self.convs1.apply(init_weights)

        # Matching convolutions with dilation 1.
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
            for _ in range(3)
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for dilated, plain in zip(self.convs1, self.convs2):
            residual = x
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = dilated(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = plain(x)
            x = x + residual
        return x

    def remove_weight_norm(self):
        # Calls the module-level torch remove_weight_norm on each conv.
        for conv in list(self.convs1) + list(self.convs2):
            remove_weight_norm(conv)
|
|
|
|
|
class ResBlock2(torch.nn.Module):
    """Lighter residual block: one dilated convolution per branch."""

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        # One weight-normalized dilated conv per dilation[0..1].
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d,
                               padding=get_padding(kernel_size, d)))
            for d in (dilation[0], dilation[1])
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        for conv in self.convs:
            residual = x
            x = conv(F.leaky_relu(x, LRELU_SLOPE))
            x = x + residual
        return x

    def remove_weight_norm(self):
        # Calls the module-level torch remove_weight_norm on each conv.
        for conv in self.convs:
            remove_weight_norm(conv)
|
|
|
|
|
|
|
class Generator(torch.nn.Module):
    """HiFi-GAN style generator: mel spectrogram -> waveform.

    A pre-conv projects h.num_mels channels up to h.upsample_initial_channel,
    then each transposed-conv upsampling stage is followed by a bank of
    residual blocks whose outputs are averaged; a post-conv + tanh produces
    a single waveform channel in [-1, 1].
    """

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            # Choose padding (and output_padding for odd k-u) so each stage
            # upsamples the time axis by exactly u.
            if (k-u) % 2 == 0:
                self.ups.append(weight_norm(
                    ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
                                    k, u, padding=(k-u)//2)))
            else:
                self.ups.append(weight_norm(
                    ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
                                    k, u, padding=(k-u)//2+1, output_padding=1)))

        # One resblock per (stage, kernel-size) pair, indexed
        # i*num_kernels + j in forward().
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel//(2**(i+1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        """Map a (B, num_mels, T) mel tensor to a (B, 1, T') waveform in [-1, 1]."""
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's resblock bank.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight_norm from every layer (do once before inference/export)."""
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None):
        """Build an eval-mode Generator from a directory holding config.json and vocoder.pt."""
        if subfolder is not None:
            pretrained_model_name_or_path = os.path.join(pretrained_model_name_or_path, subfolder)
        config_path = os.path.join(pretrained_model_name_or_path, "config.json")
        ckpt_path = os.path.join(pretrained_model_name_or_path, "vocoder.pt")

        config = get_config(config_path)
        vocoder = cls(config)

        # map_location='cpu' so checkpoints saved on GPU load on CPU-only
        # hosts; the caller moves the module to its target device afterwards.
        state_dict_g = torch.load(ckpt_path, map_location='cpu')
        vocoder.load_state_dict(state_dict_g["generator"])
        vocoder.eval()
        vocoder.remove_weight_norm()
        return vocoder

    @torch.no_grad()
    def inference(self, mels, lengths=None):
        """Vocode mels to int16 numpy waveforms; optionally truncate to `lengths` samples.

        The decorator already disables grad, so no inner no_grad block is needed.
        """
        self.eval()
        wavs = self(mels).squeeze(1)

        wavs = (wavs.cpu().numpy() * MAX_WAV_VALUE).astype("int16")

        if lengths is not None:
            wavs = wavs[:, :lengths]

        return wavs
|
|