Spaces:

tartuNLP
/

XTTSv2-est

Running

XTTSv2-est/TTS/tts/layers/feed_forward/duration_predictor.py

Rasmus Lellep

initial commit

5a03f53 5 months ago

1.1 kB

	from torch import nn

	from TTS.tts.layers.generic.res_conv_bn import Conv1dBN


	class DurationPredictor(nn.Module):
	    """Duration predictor of the Speedy Speech model.

	    Maps encoder outputs to per-step duration predictions for phonemes.

	    Note:
	        The outputs are meant to be read as log(durations); apply ``exp``
	        to recover the actual durations.

	    Layer stack::

	        conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1

	    Args:
	        hidden_channels (int): channel count used by every inner layer.
	    """

	    def __init__(self, hidden_channels):
	        super().__init__()

	        # Three Conv1dBN stages keeping the channel count unchanged. The
	        # third positional argument (4, 3, 1) is presumably the kernel size
	        # and the trailing 1 the dilation -- TODO confirm against Conv1dBN.
	        stack = [
	            Conv1dBN(hidden_channels, hidden_channels, width, 1)
	            for width in (4, 3, 1)
	        ]
	        # Final kernel-size-1 convolution reducing hidden_channels to a
	        # single output channel.
	        stack.append(nn.Conv1d(hidden_channels, 1, 1))

	        self.layers = nn.ModuleList(stack)

	    def forward(self, x, x_mask):
	        """Run the layer stack, re-applying the mask after every layer.

	        Shapes:
	            x: [B, C, T]
	            x_mask: [B, 1, T]

	        Returns:
	            The output of the last layer multiplied by ``x_mask``.
	        """
	        hidden = x
	        for block in self.layers:
	            hidden = block(hidden)
	            # Mask after each layer, not only at the end, so every
	            # subsequent layer receives an already-masked input.
	            hidden = hidden * x_mask
	        return hidden