# NOTE: page-header residue from the hosting site ("Spaces: Running"); not part of the module.
import torch
from torch import nn

from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
class DurationPredictor(nn.Module):
    """Speedy Speech duration predictor model.

    Predicts phoneme durations from encoder outputs.

    Note:
        Outputs are interpreted as log(durations). To get actual durations,
        apply an exp transformation to the output.

    Architecture:
        conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1

    Args:
        hidden_channels (int): number of channels in the inner layers.
    """

    def __init__(self, hidden_channels: int) -> None:
        super().__init__()
        # Three Conv1dBN blocks keep the channel count at `hidden_channels`;
        # the final plain 1x1 convolution projects down to a single channel.
        # NOTE(review): Conv1dBN is defined elsewhere; its 3rd/4th positional
        # arguments are presumably kernel size and dilation -- confirm.
        self.layers = nn.ModuleList(
            [
                Conv1dBN(hidden_channels, hidden_channels, 4, 1),
                Conv1dBN(hidden_channels, hidden_channels, 3, 1),
                Conv1dBN(hidden_channels, hidden_channels, 1, 1),
                nn.Conv1d(hidden_channels, 1, 1),
            ]
        )

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
        """Run the layer stack, re-applying the mask after every layer.

        Args:
            x: encoder outputs.
            x_mask: multiplicative mask applied to each layer's output.

        Returns:
            Output of the last layer multiplied by ``x_mask``; per the class
            note, interpreted as log-durations.

        Shapes:
            x: [B, C, T]
            x_mask: [B, 1, T]
            output: [B, 1, T] -- assumes every layer preserves the time
                dimension (TODO confirm for Conv1dBN padding).
        """
        o = x
        for layer in self.layers:
            # Multiply by the mask after each layer so masked time steps are
            # reset before they can feed into the next convolution.
            o = layer(o) * x_mask
        return o