Spaces:

FunAudioLLM
/

InspireMusic

Runtime error

InspireMusic / inspiremusic /wavtokenizer /decoder /discriminators.py

chong.zhang

init

5827423 7 months ago

7.75 kB

	from typing import Tuple, List

	import torch
	from torch import nn
	from torch.nn import Conv2d
	from torch.nn.utils import weight_norm


	class MultiPeriodDiscriminator(nn.Module):
	    """
	    Bank of period-based sub-discriminators, adapted from https://github.com/jik876/hifi-gan.

	    One ``DiscriminatorP`` is created per entry of ``periods``. An optional learned embedding
	    table allows every sub-discriminator to be conditioned on an integer id.

	    Args:
	        periods (tuple[int]): Period handled by each sub-discriminator.
	        num_embeddings (int, optional): Size of the conditioning embedding table. None builds
	            non-conditional sub-discriminators. Defaults to None.
	    """

	    def __init__(self, periods: Tuple[int] = (2, 3, 5, 7, 11), num_embeddings: int = None):
	        super().__init__()
	        per_period = []
	        for period in periods:
	            per_period.append(DiscriminatorP(period=period, num_embeddings=num_embeddings))
	        self.discriminators = nn.ModuleList(per_period)

	    def forward(
	        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
	    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
	        """
	        Run every sub-discriminator on both inputs.

	        Args:
	            y: First input, forwarded as ``x`` to each sub-discriminator.
	            y_hat: Second input, forwarded the same way right after ``y``.
	            bandwidth_id: Optional conditioning id(s), forwarded as ``cond_embedding_id``.

	        Returns:
	            Four lists with one entry per sub-discriminator, in this order:
	            scores for ``y``, scores for ``y_hat``, feature maps for ``y``,
	            feature maps for ``y_hat``.
	        """
	        # Each pair is evaluated left to right, so ``y`` is always processed
	        # before ``y_hat`` within a given sub-discriminator.
	        paired = [
	            (
	                disc(x=y, cond_embedding_id=bandwidth_id),
	                disc(x=y_hat, cond_embedding_id=bandwidth_id),
	            )
	            for disc in self.discriminators
	        ]

	        real_scores = [score for (score, _), _ in paired]
	        fake_scores = [score for _, (score, _) in paired]
	        real_fmaps = [feats for (_, feats), _ in paired]
	        fake_fmaps = [feats for _, (_, feats) in paired]

	        return real_scores, fake_scores, real_fmaps, fake_fmaps


	class DiscriminatorP(nn.Module):
	    """
	    Single period sub-discriminator.

	    The input signal is folded into a 2d map of width ``period`` and passed through a stack
	    of weight-normalised 2d convolutions that only convolve along the folded time axis.
	    Optionally a learned embedding, selected by an integer id, adds a projection term to
	    the final score map.

	    Args:
	        period (int): Width of the folded 2d representation.
	        in_channels (int): Input channels of the first convolution. ``forward`` always
	            feeds a single channel, so values other than 1 are not usable as-is.
	        kernel_size (int): Kernel length along the folded time axis.
	        stride (int): Stride along the folded time axis for the first four convolutions.
	        lrelu_slope (float): Negative slope of the leaky ReLU activations.
	        num_embeddings (int, optional): Size of the conditioning embedding table.
	            None means no embedding table is created. Defaults to None.
	    """

	    def __init__(
	        self,
	        period: int,
	        in_channels: int = 1,
	        kernel_size: int = 5,
	        stride: int = 3,
	        lrelu_slope: float = 0.1,
	        num_embeddings: int = None,
	    ):
	        super().__init__()
	        self.period = period
	        self.convs = nn.ModuleList(
	            [
	                weight_norm(Conv2d(in_channels, 32, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
	                weight_norm(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
	                weight_norm(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
	                weight_norm(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
	                weight_norm(Conv2d(1024, 1024, (kernel_size, 1), (1, 1), padding=(kernel_size // 2, 0))),
	            ]
	        )
	        if num_embeddings is not None:
	            # Zero-initialised, so the conditioning term starts out as a no-op.
	            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=1024)
	            torch.nn.init.zeros_(self.emb.weight)

	        self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
	        self.lrelu_slope = lrelu_slope

	    def forward(
	        self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
	    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
	        """
	        Score a batch of signals.

	        Args:
	            x: Tensor of shape (batch, time).
	            cond_embedding_id: Optional integer id tensor. Either a single id, shared by
	                the whole batch, or one id per batch item. Requires the module to have
	                been built with ``num_embeddings``.

	        Returns:
	            A tuple ``(score, fmap)``: the score map flattened to (batch, -1), and the
	            list of feature maps (activations of every convolution except the first,
	            followed by the final score map).
	        """
	        x = x.unsqueeze(1)
	        fmap = []
	        # 1d to 2d
	        b, c, t = x.shape
	        if t % self.period != 0:  # pad first so that time is a multiple of the period
	            n_pad = self.period - (t % self.period)
	            x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
	            t = t + n_pad
	        # reshape (not view) so that non-contiguous inputs are folded as well
	        x = x.reshape(b, c, t // self.period, self.period)

	        for i, l in enumerate(self.convs):
	            x = l(x)
	            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
	            if i > 0:
	                fmap.append(x)
	        if cond_embedding_id is not None:
	            emb = self.emb(cond_embedding_id)
	            # (n_ids, channels, 1, 1): a single id broadcasts over the batch, while one
	            # id per item conditions every batch element on its own embedding.
	            emb = emb.reshape(-1, emb.shape[-1], 1, 1)
	            h = (emb * x).sum(dim=1, keepdim=True)
	        else:
	            h = 0
	        x = self.conv_post(x)
	        fmap.append(x)
	        # NOTE: the add is in-place on the tensor just stored in ``fmap``, so the last
	        # feature map also contains the conditioning term. Kept to preserve behaviour.
	        x += h
	        x = torch.flatten(x, 1, -1)

	        return x, fmap


	class MultiResolutionDiscriminator(nn.Module):
	    """
	    Bank of spectrogram sub-discriminators, adapted from https://github.com/mindslab-ai/univnet.

	    One ``DiscriminatorR`` is created per entry of ``resolutions``. An optional learned
	    embedding table allows every sub-discriminator to be conditioned on an integer id.
	    """

	    def __init__(
	        self,
	        resolutions: Tuple[Tuple[int, int, int]] = ((1024, 256, 1024), (2048, 512, 2048), (512, 128, 512)),
	        num_embeddings: int = None,
	    ):
	        """
	        Args:
	            resolutions (tuple[tuple[int, int, int]]): One (n_fft, hop_length, win_length)
	                triple per sub-discriminator.
	            num_embeddings (int, optional): Size of the conditioning embedding table. None
	                builds non-conditional sub-discriminators. Defaults to None.
	        """
	        super().__init__()
	        per_resolution = []
	        for resolution in resolutions:
	            per_resolution.append(DiscriminatorR(resolution=resolution, num_embeddings=num_embeddings))
	        self.discriminators = nn.ModuleList(per_resolution)

	    def forward(
	        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
	    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
	        """
	        Run every sub-discriminator on both inputs.

	        Args:
	            y: First input, forwarded as ``x`` to each sub-discriminator.
	            y_hat: Second input, forwarded the same way right after ``y``.
	            bandwidth_id: Optional conditioning id(s), forwarded as ``cond_embedding_id``.

	        Returns:
	            Four lists with one entry per sub-discriminator, in this order:
	            scores for ``y``, scores for ``y_hat``, feature maps for ``y``,
	            feature maps for ``y_hat``.
	        """
	        # Each pair is evaluated left to right, so ``y`` is always processed
	        # before ``y_hat`` within a given sub-discriminator.
	        paired = [
	            (
	                disc(x=y, cond_embedding_id=bandwidth_id),
	                disc(x=y_hat, cond_embedding_id=bandwidth_id),
	            )
	            for disc in self.discriminators
	        ]

	        real_scores = [score for (score, _), _ in paired]
	        fake_scores = [score for _, (score, _) in paired]
	        real_fmaps = [feats for (_, feats), _ in paired]
	        fake_fmaps = [feats for _, (_, feats) in paired]

	        return real_scores, fake_scores, real_fmaps, fake_fmaps


	class DiscriminatorR(nn.Module):
	    """
	    Single resolution sub-discriminator operating on a magnitude spectrogram.

	    The input signal is converted to an STFT magnitude and passed through a stack of
	    weight-normalised 2d convolutions. Optionally a learned embedding, selected by an
	    integer id, adds a projection term to the final score map.

	    Args:
	        resolution (tuple[int, int, int]): (n_fft, hop_length, win_length) of the STFT.
	        channels (int): Number of channels of every hidden convolution.
	        in_channels (int): Input channels of the first convolution. ``forward`` always
	            feeds a single channel, so values other than 1 are not usable as-is.
	        num_embeddings (int, optional): Size of the conditioning embedding table.
	            None means no embedding table is created. Defaults to None.
	        lrelu_slope (float): Negative slope of the leaky ReLU activations.
	    """

	    def __init__(
	        self,
	        resolution: Tuple[int, int, int],
	        channels: int = 64,
	        in_channels: int = 1,
	        num_embeddings: int = None,
	        lrelu_slope: float = 0.1,
	    ):
	        super().__init__()
	        self.resolution = resolution
	        self.in_channels = in_channels
	        self.lrelu_slope = lrelu_slope
	        self.convs = nn.ModuleList(
	            [
	                weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
	                weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
	                weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
	                weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
	                weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
	            ]
	        )
	        if num_embeddings is not None:
	            # Zero-initialised, so the conditioning term starts out as a no-op.
	            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
	            torch.nn.init.zeros_(self.emb.weight)
	        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))

	    def forward(
	        self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
	    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
	        """
	        Score a batch of signals.

	        Args:
	            x: Input signal; expected to be (batch, time) so that the spectrogram becomes
	                a single-channel 4d map after ``unsqueeze(1)``.
	            cond_embedding_id: Optional integer id tensor. Either a single id, shared by
	                the whole batch, or one id per batch item. Requires the module to have
	                been built with ``num_embeddings``.

	        Returns:
	            A tuple ``(score, fmap)``: the score map flattened to (batch, -1), and the
	            list of feature maps (activations of every convolution, followed by the
	            final score map).
	        """
	        fmap = []
	        x = self.spectrogram(x)
	        x = x.unsqueeze(1)
	        for l in self.convs:
	            x = l(x)
	            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
	            fmap.append(x)
	        if cond_embedding_id is not None:
	            emb = self.emb(cond_embedding_id)
	            # (n_ids, channels, 1, 1): a single id broadcasts over the batch, while one
	            # id per item conditions every batch element on its own embedding.
	            emb = emb.reshape(-1, emb.shape[-1], 1, 1)
	            h = (emb * x).sum(dim=1, keepdim=True)
	        else:
	            h = 0
	        x = self.conv_post(x)
	        fmap.append(x)
	        # NOTE: the add is in-place on the tensor just stored in ``fmap``, so the last
	        # feature map also contains the conditioning term. Kept to preserve behaviour.
	        x += h
	        x = torch.flatten(x, 1, -1)

	        return x, fmap

	    def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
	        """
	        Return the STFT magnitude of ``x`` for this discriminator's resolution.

	        Args:
	            x: Signal tensor accepted by ``torch.stft``.

	        Returns:
	            Absolute value of the complex, centered STFT of ``x``.
	        """
	        n_fft, hop_length, win_length = self.resolution
	        magnitude_spectrogram = torch.stft(
	            x,
	            n_fft=n_fft,
	            hop_length=hop_length,
	            win_length=win_length,
	            window=None,  # interestingly rectangular window kind of works here
	            center=True,
	            return_complex=True,
	        ).abs()

	        return magnitude_spectrogram