yangwang825 committed on
Commit e5dee27 · verified · 1 Parent(s): 140492e

Upload XVectorForSequenceClassification

Files changed (13)
  1. angular_loss.py +68 -0
  2. audio_processing.py +411 -0
  3. cnn.py +247 -0
  4. config.json +6 -2
  5. conv_asr.py +189 -0
  6. features.py +560 -0
  7. logging.py +55 -0
  8. model.safetensors +3 -0
  9. modeling_xvector.py +153 -0
  10. module.py +105 -0
  11. normalization.py +99 -0
  12. spectrogram_augment.py +223 -0
  13. tdnn_attention.py +550 -0
angular_loss.py ADDED
@@ -0,0 +1,68 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class Loss(nn.modules.loss._Loss):
6
+ """Inherit this class to implement custom loss."""
7
+
8
+ def __init__(self, **kwargs):
9
+ super(Loss, self).__init__(**kwargs)
10
+
11
+
12
+ class AdditiveMarginSoftmaxLoss(Loss):
13
+ """Computes Additive Margin Softmax (CosFace) Loss
14
+
15
+ Paper: CosFace: Large Margin Cosine Loss for Deep Face Recognition
16
+
17
+ Args:
18
+ scale: scale value for cosine angle
19
+ margin: margin value added to cosine angle
20
+ """
21
+
22
+ def __init__(self, scale=30.0, margin=0.2):
23
+ super().__init__()
24
+
25
+ self.eps = 1e-7
26
+ self.scale = scale
27
+ self.margin = margin
28
+
29
+ def forward(self, logits: torch.Tensor, labels: torch.Tensor):
30
+ # Extract the logits corresponding to the true class
31
+ logits_target = logits[torch.arange(logits.size(0)), labels] # Faster indexing
32
+ numerator = self.scale * (logits_target - self.margin) # Apply additive margin
33
+ # Exclude the target logits from denominator calculation
34
+ logits.scatter_(1, labels.unsqueeze(1), float('-inf')) # Mask target class
35
+ denominator = torch.exp(numerator) + torch.sum(torch.exp(self.scale * logits), dim=1)
36
+ # Compute final loss
37
+ loss = -torch.log(torch.exp(numerator) / denominator)
38
+ return loss.mean()
39
+
40
+
41
+ class AdditiveAngularMarginSoftmaxLoss(Loss):
42
+ """Computes Additive Angular Margin Softmax (ArcFace) Loss
43
+
44
+ Paper: ArcFace: Additive Angular Margin Loss for Deep Face Recognition
45
+
46
+ Args:
47
+ scale: scale value for cosine angle
48
+ margin: margin value added to cosine angle
49
+ """
50
+
51
+ def __init__(self, scale=20.0, margin=1.35):
52
+ super().__init__()
53
+
54
+ self.eps = 1e-7
55
+ self.scale = scale
56
+ self.margin = margin
57
+
58
+ def forward(self, logits: torch.Tensor, labels: torch.Tensor):
59
+ numerator = self.scale * torch.cos(
60
+ torch.acos(torch.clamp(torch.diagonal(logits.transpose(0, 1)[labels]), -1.0 + self.eps, 1 - self.eps))
61
+ + self.margin
62
+ )
63
+ excl = torch.cat(
64
+ [torch.cat((logits[i, :y], logits[i, y + 1 :])).unsqueeze(0) for i, y in enumerate(labels)], dim=0
65
+ )
66
+ denominator = torch.exp(numerator) + torch.sum(torch.exp(self.scale * excl), dim=1)
67
+ L = numerator - torch.log(denominator)
68
+ return -torch.mean(L)
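A minimal usage sketch, assuming angular_loss.py is importable on its own and that `logits` are already cosine similarities in [-1, 1] (e.g. produced by SpeakerDecoder with angular=True); the shapes and the scale/margin values below are illustrative.

    import torch
    import torch.nn.functional as F
    from angular_loss import AdditiveMarginSoftmaxLoss, AdditiveAngularMarginSoftmaxLoss  # assumes the file is on PYTHONPATH

    batch_size, num_classes, emb_dim = 4, 10, 16
    # Cosine similarities between L2-normalized embeddings and L2-normalized class weights
    logits = F.normalize(torch.randn(batch_size, emb_dim), dim=1) @ F.normalize(torch.randn(num_classes, emb_dim), dim=1).t()
    labels = torch.randint(0, num_classes, (batch_size,))

    cosface = AdditiveMarginSoftmaxLoss(scale=30.0, margin=0.2)
    arcface = AdditiveAngularMarginSoftmaxLoss(scale=20.0, margin=1.35)
    # Note: the CosFace forward mutates `logits` in place via scatter_, hence the clone()
    print(cosface(logits.clone(), labels), arcface(logits, labels))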
audio_processing.py ADDED
@@ -0,0 +1,411 @@
1
+ import math
2
+ from packaging import version
3
+ from dataclasses import dataclass
4
+ from abc import ABC, abstractmethod
5
+
6
+ import torch
7
+
8
+ try:
9
+ import torchaudio
10
+ import torchaudio.functional
11
+ import torchaudio.transforms
12
+
13
+ TORCHAUDIO_VERSION = version.parse(torchaudio.__version__)
14
+ TORCHAUDIO_VERSION_MIN = version.parse('0.5')
15
+
16
+ HAVE_TORCHAUDIO = True
17
+ except ModuleNotFoundError:
18
+ HAVE_TORCHAUDIO = False
19
+
20
+ from .logging import logger
21
+ from .module import NeuralModule
22
+ from .features import FilterbankFeatures, FilterbankFeaturesTA
23
+ from .spectrogram_augment import SpecCutout, SpecAugment
24
+
25
+
26
+ class AudioPreprocessor(NeuralModule, ABC):
27
+ """
28
+ An interface for Neural Modules that perform audio pre-processing,
29
+ transforming the wav files to features.
30
+ """
31
+
32
+ def __init__(self, win_length, hop_length):
33
+ super().__init__()
34
+
35
+ self.win_length = win_length
36
+ self.hop_length = hop_length
37
+
38
+ self.torch_windows = {
39
+ 'hann': torch.hann_window,
40
+ 'hamming': torch.hamming_window,
41
+ 'blackman': torch.blackman_window,
42
+ 'bartlett': torch.bartlett_window,
43
+ 'ones': torch.ones,
44
+ None: torch.ones,
45
+ }
46
+
47
+ # Normally, when you call to(dtype) on a torch.nn.Module, all
48
+ # floating point parameters and buffers will change to that
49
+ # dtype, rather than being float32. The AudioPreprocessor
50
+ # classes, uniquely, don't actually have any parameters or
51
+ # buffers from what I see. In addition, we want the input to
52
+ # the preprocessor to be float32, but need to create the
53
+ # output in appropriate precision. We have this empty tensor
54
+ # here just to detect which dtype tensor this module should
55
+ # output at the end of execution.
56
+ self.register_buffer("dtype_sentinel_tensor", torch.tensor((), dtype=torch.float32), persistent=False)
57
+
58
+ @torch.no_grad()
59
+ def forward(self, input_signal, length):
60
+ processed_signal, processed_length = self.get_features(input_signal.to(torch.float32), length)
61
+ processed_signal = processed_signal.to(self.dtype_sentinel_tensor.dtype)
62
+ return processed_signal, processed_length
63
+
64
+ @abstractmethod
65
+ def get_features(self, input_signal, length):
66
+ # Called by forward(). Subclasses should implement this.
67
+ pass
68
+
69
+
70
+ class AudioToMelSpectrogramPreprocessor(AudioPreprocessor):
71
+ """Featurizer module that converts wavs to mel spectrograms.
72
+
73
+ Args:
74
+ sample_rate (int): Sample rate of the input audio data.
75
+ Defaults to 16000
76
+ window_size (float): Size of window for fft in seconds
77
+ Defaults to 0.02
78
+ window_stride (float): Stride of window for fft in seconds
79
+ Defaults to 0.01
80
+ n_window_size (int): Size of window for fft in samples
81
+ Defaults to None. Use one of window_size or n_window_size.
82
+ n_window_stride (int): Stride of window for fft in samples
83
+ Defaults to None. Use one of window_stride or n_window_stride.
84
+ window (str): Windowing function for fft. can be one of ['hann',
85
+ 'hamming', 'blackman', 'bartlett']
86
+ Defaults to "hann"
87
+ normalize (str): Can be one of ['per_feature', 'all_features']; all
88
+ other options disable feature normalization. 'all_features'
89
+ normalizes the entire spectrogram to be mean 0 with std 1.
90
+ 'per_feature' normalizes per channel / freq instead.
91
+ Defaults to "per_feature"
92
+ n_fft (int): Length of FT window. If None, it uses the smallest power
93
+ of 2 that is larger than n_window_size.
94
+ Defaults to None
95
+ preemph (float): Amount of pre emphasis to add to audio. Can be
96
+ disabled by passing None.
97
+ Defaults to 0.97
98
+ features (int): Number of mel spectrogram freq bins to output.
99
+ Defaults to 64
100
+ lowfreq (int): Lower bound on mel basis in Hz.
101
+ Defaults to 0
102
+ highfreq (int): Upper bound on mel basis in Hz.
103
+ Defaults to None
104
+ log (bool): Log features.
105
+ Defaults to True
106
+ log_zero_guard_type(str): Need to avoid taking the log of zero. There
107
+ are two options: "add" or "clamp".
108
+ Defaults to "add".
109
+ log_zero_guard_value(float, or str): Add or clamp requires the number
110
+ to add with or clamp to. log_zero_guard_value can either be a float
111
+ or "tiny" or "eps". torch.finfo is used if "tiny" or "eps" is
112
+ passed.
113
+ Defaults to 2**-24.
114
+ dither (float): Amount of white-noise dithering.
115
+ Defaults to 1e-5
116
+ pad_to (int): Ensures that the output size of the time dimension is
117
+ a multiple of pad_to.
118
+ Defaults to 16
119
+ frame_splicing (int): Defaults to 1
120
+ exact_pad (bool): If True, sets stft center to False and adds padding, such that num_frames = audio_length
121
+ // hop_length. Defaults to False.
122
+ pad_value (float): The value that shorter mels are padded with.
123
+ Defaults to 0
124
+ mag_power (float): The power that the linear spectrogram is raised to
125
+ prior to multiplication with mel basis.
126
+ Defaults to 2 for a power spec
127
+ rng : Random number generator
128
+ nb_augmentation_prob (float) : Probability with which narrowband augmentation would be applied to
129
+ samples in the batch.
130
+ Defaults to 0.0
131
+ nb_max_freq (int) : Frequency above which all frequencies will be masked for narrowband augmentation.
132
+ Defaults to 4000
133
+ use_torchaudio: Whether to use the `torchaudio` implementation.
134
+ mel_norm: Normalization used for mel filterbank weights.
135
+ Defaults to 'slaney' (area normalization)
136
+ stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints.
137
+ stft_conv: Deprecated argument, kept for compatibility with older checkpoints.
138
+ """
139
+
140
+ def __init__(
141
+ self,
142
+ sample_rate=16000,
143
+ window_size=0.02,
144
+ window_stride=0.01,
145
+ n_window_size=None,
146
+ n_window_stride=None,
147
+ window="hann",
148
+ normalize="per_feature",
149
+ n_fft=None,
150
+ preemph=0.97,
151
+ features=64,
152
+ lowfreq=0,
153
+ highfreq=None,
154
+ log=True,
155
+ log_zero_guard_type="add",
156
+ log_zero_guard_value=2**-24,
157
+ dither=1e-5,
158
+ pad_to=16,
159
+ frame_splicing=1,
160
+ exact_pad=False,
161
+ pad_value=0,
162
+ mag_power=2.0,
163
+ rng=None,
164
+ nb_augmentation_prob=0.0,
165
+ nb_max_freq=4000,
166
+ use_torchaudio: bool = False,
167
+ mel_norm="slaney",
168
+ ):
169
+ super().__init__(n_window_size, n_window_stride)
170
+
171
+ self._sample_rate = sample_rate
172
+ if window_size and n_window_size:
173
+ raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
174
+ if window_stride and n_window_stride:
175
+ raise ValueError(
176
+ f"{self} received both window_stride and " f"n_window_stride. Only one should be specified."
177
+ )
178
+ if window_size:
179
+ n_window_size = int(window_size * self._sample_rate)
180
+ if window_stride:
181
+ n_window_stride = int(window_stride * self._sample_rate)
182
+
183
+ # Given the long and similar argument list, point to the class and instantiate it by reference
184
+ if not use_torchaudio:
185
+ logger.warning("Current only support FilterbankFeatures with torchaudio.")
186
+ featurizer_class = FilterbankFeaturesTA
187
+ else:
188
+ featurizer_class = FilterbankFeaturesTA
189
+ self.featurizer = featurizer_class(
190
+ sample_rate=self._sample_rate,
191
+ n_window_size=n_window_size,
192
+ n_window_stride=n_window_stride,
193
+ window=window,
194
+ normalize=normalize,
195
+ n_fft=n_fft,
196
+ preemph=preemph,
197
+ nfilt=features,
198
+ lowfreq=lowfreq,
199
+ highfreq=highfreq,
200
+ log=log,
201
+ log_zero_guard_type=log_zero_guard_type,
202
+ log_zero_guard_value=log_zero_guard_value,
203
+ dither=dither,
204
+ pad_to=pad_to,
205
+ frame_splicing=frame_splicing,
206
+ exact_pad=exact_pad,
207
+ pad_value=pad_value,
208
+ mag_power=mag_power,
209
+ rng=rng,
210
+ nb_augmentation_prob=nb_augmentation_prob,
211
+ nb_max_freq=nb_max_freq,
212
+ mel_norm=mel_norm,
213
+ )
214
+
215
+ def get_features(self, input_signal, length):
216
+ return self.featurizer(input_signal, length) # return tensor shape of (B, D, T)
217
+
218
+ @property
219
+ def filter_banks(self):
220
+ return self.featurizer.filter_banks
221
+
222
+
223
+ class AudioToMFCCPreprocessor(AudioPreprocessor):
224
+ """Preprocessor that converts wavs to MFCCs.
225
+ Uses torchaudio.transforms.MFCC.
226
+
227
+ Args:
228
+ sample_rate: The sample rate of the audio.
229
+ Defaults to 16000.
230
+ window_size: Size of window for fft in seconds. Used to calculate the
231
+ win_length arg for mel spectrogram.
232
+ Defaults to 0.02
233
+ window_stride: Stride of window for fft in seconds. Used to calculate
234
+ the hop_length arg for mel spect.
235
+ Defaults to 0.01
236
+ n_window_size: Size of window for fft in samples
237
+ Defaults to None. Use one of window_size or n_window_size.
238
+ n_window_stride: Stride of window for fft in samples
239
+ Defaults to None. Use one of window_stride or n_window_stride.
240
+ window: Windowing function for fft. can be one of ['hann',
241
+ 'hamming', 'blackman', 'bartlett', 'none', 'null'].
242
+ Defaults to 'hann'
243
+ n_fft: Length of FT window. If None, it uses the smallest power of 2
244
+ that is larger than n_window_size.
245
+ Defaults to None
246
+ lowfreq (int): Lower bound on mel basis in Hz.
247
+ Defaults to 0
248
+ highfreq (int): Upper bound on mel basis in Hz.
249
+ Defaults to None
250
+ n_mels: Number of mel filterbanks.
251
+ Defaults to 64
252
+ n_mfcc: Number of coefficients to retain
253
+ Defaults to 64
254
+ dct_type: Type of discrete cosine transform to use
255
+ norm: Type of norm to use
256
+ log: Whether to use log-mel spectrograms instead of db-scaled.
257
+ Defaults to True.
258
+ """
259
+
260
+ def __init__(
261
+ self,
262
+ sample_rate=16000,
263
+ window_size=0.02,
264
+ window_stride=0.01,
265
+ n_window_size=None,
266
+ n_window_stride=None,
267
+ window='hann',
268
+ n_fft=None,
269
+ lowfreq=0.0,
270
+ highfreq=None,
271
+ n_mels=64,
272
+ n_mfcc=64,
273
+ dct_type=2,
274
+ norm='ortho',
275
+ log=True,
276
+ ):
277
+ self._sample_rate = sample_rate
278
+ if not HAVE_TORCHAUDIO:
279
+ logger.warning('Could not import torchaudio. Some features might not work.')
280
+
281
+ raise ModuleNotFoundError(
282
+ "torchaudio is not installed but is necessary for "
283
+ "AudioToMFCCPreprocessor. We recommend you try "
284
+ "building it from source for the PyTorch version you have."
285
+ )
286
+ if window_size and n_window_size:
287
+ raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
288
+ if window_stride and n_window_stride:
289
+ raise ValueError(
290
+ f"{self} received both window_stride and " f"n_window_stride. Only one should be specified."
291
+ )
292
+ # Get win_length (n_window_size) and hop_length (n_window_stride)
293
+ if window_size:
294
+ n_window_size = int(window_size * self._sample_rate)
295
+ if window_stride:
296
+ n_window_stride = int(window_stride * self._sample_rate)
297
+
298
+ super().__init__(n_window_size, n_window_stride)
299
+
300
+ mel_kwargs = {}
301
+
302
+ mel_kwargs['f_min'] = lowfreq
303
+ mel_kwargs['f_max'] = highfreq
304
+ mel_kwargs['n_mels'] = n_mels
305
+
306
+ mel_kwargs['n_fft'] = n_fft or 2 ** math.ceil(math.log2(n_window_size))
307
+
308
+ mel_kwargs['win_length'] = n_window_size
309
+ mel_kwargs['hop_length'] = n_window_stride
310
+
311
+ # Set window_fn. None defaults to torch.ones.
312
+ window_fn = self.torch_windows.get(window, None)
313
+ if window_fn is None:
314
+ raise ValueError(
315
+ f"Window argument for AudioProcessor is invalid: {window}."
316
+ f"For no window function, use 'ones' or None."
317
+ )
318
+ mel_kwargs['window_fn'] = window_fn
319
+
320
+ # Use torchaudio's implementation of MFCCs as featurizer
321
+ self.featurizer = torchaudio.transforms.MFCC(
322
+ sample_rate=self._sample_rate,
323
+ n_mfcc=n_mfcc,
324
+ dct_type=dct_type,
325
+ norm=norm,
326
+ log_mels=log,
327
+ melkwargs=mel_kwargs,
328
+ )
329
+
330
+ def get_features(self, input_signal, length):
331
+ features = self.featurizer(input_signal)
332
+ seq_len = torch.ceil(length.to(torch.float32) / self.hop_length).to(dtype=torch.long)
333
+ return features, seq_len
334
+
335
+
336
+ class SpectrogramAugmentation(NeuralModule):
337
+ """
338
+ Performs time and freq cuts in one of two ways.
339
+ SpecAugment zeroes out vertical and horizontal sections as described in
340
+ SpecAugment (https://arxiv.org/abs/1904.08779). Arguments for use with
341
+ SpecAugment are `freq_masks`, `time_masks`, `freq_width`, and `time_width`.
342
+ SpecCutout zeroes out rectangular regions as described in Cutout
343
+ (https://arxiv.org/abs/1708.04552). Arguments for use with Cutout are
344
+ `rect_masks`, `rect_freq`, and `rect_time`.
345
+
346
+ Args:
347
+ freq_masks (int): how many frequency segments should be cut.
348
+ Defaults to 0.
349
+ time_masks (int): how many time segments should be cut
350
+ Defaults to 0.
351
+ freq_width (int): maximum number of frequencies to be cut in one
352
+ segment.
353
+ Defaults to 10.
354
+ time_width (int): maximum number of time steps to be cut in one
355
+ segment
356
+ Defaults to 10.
357
+ rect_masks (int): how many rectangular masks should be cut
358
+ Defaults to 0.
359
+ rect_freq (int): maximum size of cut rectangles along the frequency
360
+ dimension
361
+ Defaults to 5.
362
+ rect_time (int): maximum size of cut rectangles along the time
363
+ dimension
364
+ Defaults to 25.
365
+ use_numba_spec_augment: use numba code for Spectrogram augmentation
366
+ use_vectorized_spec_augment: use vectorized code for Spectrogram augmentation
367
+
368
+ """
369
+
370
+ def __init__(
371
+ self,
372
+ freq_masks=0,
373
+ time_masks=0,
374
+ freq_width=10,
375
+ time_width=10,
376
+ rect_masks=0,
377
+ rect_time=5,
378
+ rect_freq=20,
379
+ rng=None,
380
+ mask_value=0.0,
381
+ use_vectorized_spec_augment: bool = True,
382
+ ):
383
+ super().__init__()
384
+
385
+ if rect_masks > 0:
386
+ self.spec_cutout = SpecCutout(
387
+ rect_masks=rect_masks,
388
+ rect_time=rect_time,
389
+ rect_freq=rect_freq,
390
+ rng=rng,
391
+ )
392
+ # self.spec_cutout.to(self._device)
393
+ else:
394
+ self.spec_cutout = lambda input_spec: input_spec
395
+ if freq_masks + time_masks > 0:
396
+ self.spec_augment = SpecAugment(
397
+ freq_masks=freq_masks,
398
+ time_masks=time_masks,
399
+ freq_width=freq_width,
400
+ time_width=time_width,
401
+ rng=rng,
402
+ mask_value=mask_value,
403
+ use_vectorized_code=use_vectorized_spec_augment,
404
+ )
405
+ else:
406
+ self.spec_augment = lambda input_spec, length: input_spec
407
+
408
+ def forward(self, input_spec, length):
409
+ augmented_spec = self.spec_cutout(input_spec=input_spec)
410
+ augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length)
411
+ return augmented_spec  # return tensor shape of (B, D, T)
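A usage sketch for running a padded waveform batch through the preprocessor and SpecAugment. The package name `xvector` is an assumption (the files use relative imports, so they must be imported as a package), and the augmentation values are illustrative.

    import torch
    from xvector.audio_processing import AudioToMelSpectrogramPreprocessor, SpectrogramAugmentation

    preproc = AudioToMelSpectrogramPreprocessor(sample_rate=16000, features=64, use_torchaudio=True)
    augment = SpectrogramAugmentation(freq_masks=3, time_masks=5, freq_width=4, time_width=0.03)

    waveform = torch.randn(2, 32000)              # (B, samples): two 2-second clips at 16 kHz
    lengths = torch.tensor([32000, 24000])        # true (unpadded) lengths in samples
    feats, feat_lengths = preproc(input_signal=waveform, length=lengths)   # feats: (B, 64, T)
    feats = augment(input_spec=feats, length=feat_lengths)                 # same shape, masked regions zeroed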
cnn.py ADDED
@@ -0,0 +1,247 @@
1
+ import math
2
+
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
+ class Conv1d(nn.Module):
8
+ """This function implements 1d convolution.
9
+
10
+ Arguments
11
+ ---------
12
+ out_channels : int
13
+ It is the number of output channels.
14
+ kernel_size : int
15
+ Kernel size of the convolutional filters.
16
+ input_shape : tuple
17
+ The shape of the input. Alternatively use ``in_channels``.
18
+ in_channels : int
19
+ The number of input channels. Alternatively use ``input_shape``.
20
+ stride : int
21
+ Stride factor of the convolutional filters. When the stride factor > 1,
22
+ a decimation in time is performed.
23
+ dilation : int
24
+ Dilation factor of the convolutional filters.
25
+ padding : str
26
+ (same, valid, causal). If "valid", no padding is performed.
27
+ If "same" and stride is 1, output shape is the same as the input shape.
28
+ "causal" results in causal (dilated) convolutions.
29
+ groups : int
30
+ Number of blocked connections from input channels to output channels.
31
+ bias : bool
32
+ Whether to add a bias term to convolution operation.
33
+ padding_mode : str
34
+ This flag specifies the type of padding. See torch.nn documentation
35
+ for more information.
36
+ skip_transpose : bool
37
+ If False, uses batch x time x channel convention of speechbrain.
38
+ If True, uses batch x channel x time convention.
39
+ weight_norm : bool
40
+ If True, use weight normalization,
41
+ to be removed with self.remove_weight_norm() at inference
42
+ conv_init : str
43
+ Weight initialization for the convolution network
44
+ default_padding: str or int
45
+ This sets the default padding mode that will be used by the pytorch Conv1d backend.
46
+
47
+ Example
48
+ -------
49
+ >>> inp_tensor = torch.rand([10, 40, 16])
50
+ >>> cnn_1d = Conv1d(
51
+ ... input_shape=inp_tensor.shape, out_channels=8, kernel_size=5
52
+ ... )
53
+ >>> out_tensor = cnn_1d(inp_tensor)
54
+ >>> out_tensor.shape
55
+ torch.Size([10, 40, 8])
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ out_channels,
61
+ kernel_size,
62
+ input_shape=None,
63
+ in_channels=None,
64
+ stride=1,
65
+ dilation=1,
66
+ padding="same",
67
+ groups=1,
68
+ bias=True,
69
+ padding_mode="reflect",
70
+ skip_transpose=False,
71
+ weight_norm=False,
72
+ conv_init=None,
73
+ default_padding=0,
74
+ ):
75
+ super().__init__()
76
+ self.kernel_size = kernel_size
77
+ self.stride = stride
78
+ self.dilation = dilation
79
+ self.padding = padding
80
+ self.padding_mode = padding_mode
81
+ self.unsqueeze = False
82
+ self.skip_transpose = skip_transpose
83
+
84
+ if input_shape is None and in_channels is None:
85
+ raise ValueError("Must provide one of input_shape or in_channels")
86
+
87
+ if in_channels is None:
88
+ in_channels = self._check_input_shape(input_shape)
89
+
90
+ self.in_channels = in_channels
91
+
92
+ self.conv = nn.Conv1d(
93
+ in_channels,
94
+ out_channels,
95
+ self.kernel_size,
96
+ stride=self.stride,
97
+ dilation=self.dilation,
98
+ padding=default_padding,
99
+ groups=groups,
100
+ bias=bias,
101
+ )
102
+
103
+ if conv_init == "kaiming":
104
+ nn.init.kaiming_normal_(self.conv.weight)
105
+ elif conv_init == "zero":
106
+ nn.init.zeros_(self.conv.weight)
107
+ elif conv_init == "normal":
108
+ nn.init.normal_(self.conv.weight, std=1e-6)
109
+
110
+ if weight_norm:
111
+ self.conv = nn.utils.weight_norm(self.conv)
112
+
113
+ def forward(self, x, *args, **kwargs):
114
+ """Returns the output of the convolution.
115
+
116
+ Arguments
117
+ ---------
118
+ x : torch.Tensor (batch, time, channel)
119
+ input to convolve. 2d or 4d tensors are expected.
120
+
121
+ Returns
122
+ -------
123
+ wx : torch.Tensor
124
+ The convolved outputs.
125
+ """
126
+ if not self.skip_transpose:
127
+ x = x.transpose(1, -1)
128
+
129
+ if self.unsqueeze:
130
+ x = x.unsqueeze(1)
131
+
132
+ if self.padding == "same":
133
+ x = self._manage_padding(
134
+ x, self.kernel_size, self.dilation, self.stride
135
+ )
136
+
137
+ elif self.padding == "causal":
138
+ num_pad = (self.kernel_size - 1) * self.dilation
139
+ x = F.pad(x, (num_pad, 0))
140
+
141
+ elif self.padding == "valid":
142
+ pass
143
+
144
+ else:
145
+ raise ValueError(
146
+ "Padding must be 'same', 'valid' or 'causal'. Got "
147
+ + self.padding
148
+ )
149
+
150
+ wx = self.conv(x)
151
+
152
+ if self.unsqueeze:
153
+ wx = wx.squeeze(1)
154
+
155
+ if not self.skip_transpose:
156
+ wx = wx.transpose(1, -1)
157
+
158
+ return wx
159
+
160
+ def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
161
+ """This function performs zero-padding on the time axis
162
+ such that the length is unchanged after the convolution.
163
+
164
+ Arguments
165
+ ---------
166
+ x : torch.Tensor
167
+ Input tensor.
168
+ kernel_size : int
169
+ Size of kernel.
170
+ dilation : int
171
+ Dilation used.
172
+ stride : int
173
+ Stride.
174
+
175
+ Returns
176
+ -------
177
+ x : torch.Tensor
178
+ The padded outputs.
179
+ """
180
+
181
+ # Detecting input shape
182
+ L_in = self.in_channels
183
+
184
+ # Time padding
185
+ padding = get_padding_elem(L_in, stride, kernel_size, dilation)
186
+
187
+ # Applying padding
188
+ x = F.pad(x, padding, mode=self.padding_mode)
189
+
190
+ return x
191
+
192
+ def _check_input_shape(self, shape):
193
+ """Checks the input shape and returns the number of input channels."""
194
+
195
+ if len(shape) == 2:
196
+ self.unsqueeze = True
197
+ in_channels = 1
198
+ elif self.skip_transpose:
199
+ in_channels = shape[1]
200
+ elif len(shape) == 3:
201
+ in_channels = shape[2]
202
+ else:
203
+ raise ValueError(
204
+ "conv1d expects 2d, 3d inputs. Got " + str(len(shape))
205
+ )
206
+
207
+ # Kernel size must be odd
208
+ if not self.padding == "valid" and self.kernel_size % 2 == 0:
209
+ raise ValueError(
210
+ "The field kernel size must be an odd number. Got %s."
211
+ % (self.kernel_size)
212
+ )
213
+
214
+ return in_channels
215
+
216
+ def remove_weight_norm(self):
217
+ """Removes weight normalization at inference if used during training."""
218
+ self.conv = nn.utils.remove_weight_norm(self.conv)
219
+
220
+
221
+ def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
222
+ """This function computes the number of elements to add for zero-padding.
223
+
224
+ Arguments
225
+ ---------
226
+ L_in : int
227
+ stride: int
228
+ kernel_size : int
229
+ dilation : int
230
+
231
+ Returns
232
+ -------
233
+ padding : int
234
+ The size of the padding to be added
235
+ """
236
+ if stride > 1:
237
+ padding = [math.floor(kernel_size / 2), math.floor(kernel_size / 2)]
238
+
239
+ else:
240
+ L_out = (
241
+ math.floor((L_in - dilation * (kernel_size - 1) - 1) / stride) + 1
242
+ )
243
+ padding = [
244
+ math.floor((L_in - L_out) / 2),
245
+ math.floor((L_in - L_out) / 2),
246
+ ]
247
+ return padding
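An illustrative sketch of the "same"-padding arithmetic: for stride 1, get_padding_elem pads (kernel_size - 1) * dilation // 2 elements on each side, so Conv1d preserves the time length. The values below are arbitrary.

    import torch
    from cnn import Conv1d, get_padding_elem  # assumes cnn.py is on PYTHONPATH

    print(get_padding_elem(L_in=100, stride=1, kernel_size=5, dilation=2))   # [4, 4]

    x = torch.rand(8, 100, 40)                                               # (batch, time, channel) convention
    conv = Conv1d(input_shape=x.shape, out_channels=64, kernel_size=5, dilation=2)
    print(conv(x).shape)                                                     # torch.Size([8, 100, 64]); time dim preserved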
config.json CHANGED
@@ -1,9 +1,12 @@
 {
-  "_attn_implementation_autoset": true,
   "angular": false,
+  "architectures": [
+    "XVectorForSequenceClassification"
+  ],
   "attention_channels": 128,
   "auto_map": {
-    "AutoConfig": "configuration_xvector.XVectorConfig"
+    "AutoConfig": "configuration_xvector.XVectorConfig",
+    "AutoModelForAudioClassification": "modeling_xvector.XVectorForSequenceClassification"
   },
   "bos_token_id": 1,
   "decoder_config": {
@@ -2603,6 +2606,7 @@
   },
   "time_masks": 5,
   "time_width": 0.03,
+  "torch_dtype": "float32",
   "transformers_version": "4.48.3",
   "use_torchaudio": true,
   "use_vectorized_spec_augment": true,
conv_asr.py ADDED
@@ -0,0 +1,189 @@
1
+ from typing import Optional, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from .module import NeuralModule
8
+ from .tdnn_attention import StatsPoolLayer, AttentivePoolLayer, init_weights
9
+ from .cnn import Conv1d
10
+ from .normalization import BatchNorm1d
11
+
12
+
13
+ class TDNNLayer(nn.Module):
14
+
15
+ def __init__(self, in_conv_dim, out_conv_dim, kernel_size, dilation):
16
+ super().__init__()
17
+ self.in_conv_dim = in_conv_dim
18
+ self.out_conv_dim = out_conv_dim
19
+ self.kernel_size = kernel_size
20
+ self.dilation = dilation
21
+
22
+ self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
23
+ self.activation = nn.ReLU()
24
+
25
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
26
+ # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
27
+ weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
28
+ hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
29
+ hidden_states = self.activation(hidden_states)
30
+ return hidden_states
31
+
32
+
33
+ class XVectorEncoder(NeuralModule):
34
+ """
35
+ input:
36
+ feat_in: input feature shape (mel spec feature shape)
37
+ filters: list of filter shapes for SE_TDNN modules
38
+ kernel_sizes: list of kernel shapes for SE_TDNN modules
39
+ dilations: list of dilations for group conv se layer
40
+ scale: scale value to group wider conv channels (default: 8)
41
+
42
+ output:
43
+ outputs : encoded output
44
+ output_length: masked output lengths
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ feat_in: int,
50
+ filters: list,
51
+ kernel_sizes: list,
52
+ dilations: list,
53
+ init_mode: str = 'xavier_uniform',
54
+ ):
55
+ super().__init__()
56
+ self.blocks = nn.ModuleList()
57
+
58
+ # TDNN layers
59
+ in_channels = feat_in
60
+ tdnn_blocks = len(filters)
61
+ for block_index in range(tdnn_blocks):
62
+ out_channels = filters[block_index]
63
+ self.blocks.extend(
64
+ [
65
+ Conv1d(
66
+ in_channels=in_channels,
67
+ out_channels=out_channels,
68
+ kernel_size=kernel_sizes[block_index],
69
+ dilation=dilations[block_index],
70
+ ),
71
+ torch.nn.LeakyReLU(),
72
+ BatchNorm1d(input_size=out_channels),
73
+ ]
74
+ )
75
+ in_channels = filters[block_index]
76
+
77
+ self.apply(lambda x: init_weights(x, mode=init_mode))
78
+
79
+ def forward(self, audio_signal: torch.Tensor, length: torch.Tensor = None):
80
+ """
81
+ audio_signal: tensor shape of (B, D, T)
82
+ output: tensor shape of (B, D, T)
83
+ """
84
+ x = audio_signal.transpose(1, 2)
85
+ for layer in self.blocks:
86
+ x = layer(x)
87
+ output = x.transpose(1, 2)
88
+ return output, length
89
+
90
+
91
+ class SpeakerDecoder(NeuralModule):
92
+ """
93
+ Speaker Decoder creates the final neural layers that maps from the outputs
94
+ of Jasper Encoder to the embedding layer followed by speaker based softmax loss.
95
+
96
+ Args:
97
+ feat_in (int): Number of channels being input to this module
98
+ num_classes (int): Number of unique speakers in dataset
99
+ emb_sizes (list) : shapes of intermediate embedding layers (the speaker embedding is taken
100
+ from the first of these layers). Defaults to [1024, 1024]
101
+ pool_mode (str) : Pooling strategy type. options are 'xvector','tap', 'attention'
102
+ Defaults to 'xvector' (mean and variance)
103
+ tap (temporal average pooling: just mean)
104
+ attention (attention based pooling)
105
+ init_mode (str): Describes how neural network parameters are
106
+ initialized. Options are ['xavier_uniform', 'xavier_normal',
107
+ 'kaiming_uniform','kaiming_normal'].
108
+ Defaults to "xavier_uniform".
109
+ """
110
+
111
+ def __init__(
112
+ self,
113
+ feat_in: int,
114
+ num_classes: int,
115
+ emb_sizes: Optional[Union[int, list]] = 256,
116
+ pool_mode: str = 'xvector',
117
+ angular: bool = False,
118
+ attention_channels: int = 128,
119
+ init_mode: str = "xavier_uniform",
120
+ ):
121
+ super().__init__()
122
+ self.angular = angular
123
+ self.emb_id = 2
124
+ bias = False if self.angular else True
125
+ emb_sizes = [emb_sizes] if type(emb_sizes) is int else emb_sizes
126
+
127
+ self._num_classes = num_classes
128
+ self.pool_mode = pool_mode.lower()
129
+ if self.pool_mode == 'xvector' or self.pool_mode == 'tap':
130
+ self._pooling = StatsPoolLayer(feat_in=feat_in, pool_mode=self.pool_mode)
131
+ affine_type = 'linear'
132
+ elif self.pool_mode == 'attention':
133
+ self._pooling = AttentivePoolLayer(inp_filters=feat_in, attention_channels=attention_channels)
134
+ affine_type = 'conv'
135
+
136
+ shapes = [self._pooling.feat_in]
137
+ for size in emb_sizes:
138
+ shapes.append(int(size))
139
+
140
+ emb_layers = []
141
+ for shape_in, shape_out in zip(shapes[:-1], shapes[1:]):
142
+ layer = self.affine_layer(shape_in, shape_out, learn_mean=False, affine_type=affine_type)
143
+ emb_layers.append(layer)
144
+
145
+ self.emb_layers = nn.ModuleList(emb_layers)
146
+
147
+ self.final = nn.Linear(shapes[-1], self._num_classes, bias=bias)
148
+
149
+ self.apply(lambda x: init_weights(x, mode=init_mode))
150
+
151
+ def affine_layer(
152
+ self,
153
+ inp_shape,
154
+ out_shape,
155
+ learn_mean=True,
156
+ affine_type='conv',
157
+ ):
158
+ if affine_type == 'conv':
159
+ layer = nn.Sequential(
160
+ nn.BatchNorm1d(inp_shape, affine=True, track_running_stats=True),
161
+ nn.Conv1d(inp_shape, out_shape, kernel_size=1),
162
+ )
163
+
164
+ else:
165
+ layer = nn.Sequential(
166
+ nn.Linear(inp_shape, out_shape),
167
+ nn.BatchNorm1d(out_shape, affine=learn_mean, track_running_stats=True),
168
+ nn.ReLU(),
169
+ )
170
+
171
+ return layer
172
+
173
+ def forward(self, encoder_output, length: torch.Tensor = None):
174
+ pool = self._pooling(encoder_output, length)
175
+ embs = []
176
+
177
+ for layer in self.emb_layers:
178
+ pool, emb = layer(pool), layer[: self.emb_id](pool)
179
+ embs.append(emb)
180
+
181
+ pool = pool.squeeze(-1)
182
+ if self.angular:
183
+ for W in self.final.parameters():
184
+ W = F.normalize(W, p=2, dim=1)
185
+ pool = F.normalize(pool, p=2, dim=1)
186
+
187
+ out = self.final(pool)
188
+
189
+ return out, embs[-1].squeeze(-1)
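A shape-level sketch of the encoder/decoder pair. The package name is assumed (the files use relative imports), the filter/kernel/dilation lists and class count are illustrative x-vector-style settings rather than the shipped config, and the pooling shapes assume the NeMo-style StatsPoolLayer in tdnn_attention.py.

    import torch
    from xvector.conv_asr import XVectorEncoder, SpeakerDecoder

    encoder = XVectorEncoder(
        feat_in=64,
        filters=[512, 512, 512, 512, 1500],
        kernel_sizes=[5, 3, 3, 1, 1],
        dilations=[1, 2, 3, 1, 1],
    )
    decoder = SpeakerDecoder(feat_in=1500, num_classes=7205, emb_sizes=512, pool_mode='xvector')

    feats = torch.randn(2, 64, 300)                             # (B, D, T) mel features
    lengths = torch.tensor([300, 250])
    encoded, _ = encoder(audio_signal=feats, length=lengths)    # (B, 1500, T)
    logits, embedding = decoder(encoder_output=encoded, length=lengths)
    print(logits.shape, embedding.shape)                        # (2, 7205) class logits, (2, 512) speaker embedding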
features.py ADDED
@@ -0,0 +1,560 @@
1
+ import math
2
+ import random
3
+ from typing import Optional, Union, Tuple
4
+
5
+ import librosa
6
+ import torchaudio
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ try:
12
+ import torchaudio
13
+
14
+ HAVE_TORCHAUDIO = True
15
+ except ModuleNotFoundError:
16
+ HAVE_TORCHAUDIO = False
17
+
18
+ CONSTANT = 1e-5
19
+
20
+
21
+ def normalize_batch(x, seq_len, normalize_type):
22
+ x_mean = None
23
+ x_std = None
24
+ if normalize_type == "per_feature":
25
+ batch_size = x.shape[0]
26
+ max_time = x.shape[2]
27
+
28
+ # When doing stream capture to a graph, item() is not allowed
29
+ # because it calls cudaStreamSynchronize(). Therefore, we are
30
+ # sacrificing some error checking when running with cuda graphs.
31
+ if (
32
+ torch.cuda.is_available()
33
+ and not torch.cuda.is_current_stream_capturing()
34
+ and torch.any(seq_len == 1).item()
35
+ ):
36
+ raise ValueError(
37
+ "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result "
38
+ "in torch.std() returning nan. Make sure your audio length has enough samples for a single "
39
+ "feature (ex. at least `hop_length` for Mel Spectrograms)."
40
+ )
41
+ time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time)
42
+ valid_mask = time_steps < seq_len.unsqueeze(1)
43
+ x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2)
44
+ x_mean_denominator = valid_mask.sum(axis=1)
45
+ x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1)
46
+
47
+ # Subtract 1 in the denominator to correct for the bias.
48
+ x_std = torch.sqrt(
49
+ torch.sum(torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0) ** 2, axis=2)
50
+ / (x_mean_denominator.unsqueeze(1) - 1.0)
51
+ )
52
+ # make sure x_std is not zero
53
+ x_std += CONSTANT
54
+ return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std
55
+ elif normalize_type == "all_features":
56
+ x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
57
+ x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
58
+ for i in range(x.shape[0]):
59
+ x_mean[i] = x[i, :, : seq_len[i].item()].mean()
60
+ x_std[i] = x[i, :, : seq_len[i].item()].std()
61
+ # make sure x_std is not zero
62
+ x_std += CONSTANT
63
+ return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std
64
+ elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type:
65
+ x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device)
66
+ x_std = torch.tensor(normalize_type["fixed_std"], device=x.device)
67
+ return (
68
+ (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2),
69
+ x_mean,
70
+ x_std,
71
+ )
72
+ else:
73
+ return x, x_mean, x_std
74
+
75
+
76
+ def clean_spectrogram_batch(spectrogram: torch.Tensor, spectrogram_len: torch.Tensor, fill_value=0.0) -> torch.Tensor:
77
+ """
78
+ Fill spectrogram values outside the length with `fill_value`
79
+
80
+ Args:
81
+ spectrogram: Tensor with shape [B, C, L] containing batched spectrograms
82
+ spectrogram_len: Tensor with shape [B] containing the sequence length of each batch element
83
+ fill_value: value to fill with, 0.0 by default
84
+
85
+ Returns:
86
+ cleaned spectrogram, tensor with shape equal to `spectrogram`
87
+ """
88
+ device = spectrogram.device
89
+ batch_size, _, max_len = spectrogram.shape
90
+ mask = torch.arange(max_len, device=device)[None, :] >= spectrogram_len[:, None]
91
+ mask = mask.unsqueeze(1).expand_as(spectrogram)
92
+ return spectrogram.masked_fill(mask, fill_value)
93
+
94
+
95
+ def splice_frames(x, frame_splicing):
96
+ """Stacks frames together across feature dim
97
+
98
+ input is batch_size, feature_dim, num_frames
99
+ output is batch_size, feature_dim*frame_splicing, num_frames
100
+
101
+ """
102
+ seq = [x]
103
+ for n in range(1, frame_splicing):
104
+ seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2))
105
+ return torch.cat(seq, dim=1)
106
+
107
+
108
+ @torch.jit.script_if_tracing
109
+ def make_seq_mask_like(
110
+ lengths: torch.Tensor, like: torch.Tensor, time_dim: int = -1, valid_ones: bool = True
111
+ ) -> torch.Tensor:
112
+ """
113
+
114
+ Args:
115
+ lengths: Tensor with shape [B] containing the sequence length of each batch element
116
+ like: The mask will contain the same number of dimensions as this Tensor, and will have the same max
117
+ length in the time dimension of this Tensor.
118
+ time_dim: Time dimension of `like` and the resulting mask. Zero-based.
119
+ valid_ones: If True, valid tokens will contain value `1` and padding will be `0`. Else, invert.
120
+
121
+ Returns:
122
+ A :class:`torch.Tensor` containing 1's and 0's for valid and invalid tokens, respectively, if `valid_ones`, else
123
+ vice-versa. Mask will have the same number of dimensions as `like`. Batch and time dimensions will match
124
+ the `like`. All other dimensions will be singletons. E.g., if `like.shape == [3, 4, 5]` and
125
+ `time_dim == -1`, mask will have shape `[3, 1, 5]`.
126
+ """
127
+ # Mask with shape [B, T]
128
+ mask = torch.arange(like.shape[time_dim], device=like.device).repeat(lengths.shape[0], 1).lt(lengths.view(-1, 1))
129
+ # [B, T] -> [B, *, T] where * is any number of singleton dimensions to expand to like tensor
130
+ for _ in range(like.dim() - mask.dim()):
131
+ mask = mask.unsqueeze(1)
132
+ # If needed, transpose time dim
133
+ if time_dim != -1 and time_dim != mask.dim() - 1:
134
+ mask = mask.transpose(-1, time_dim)
135
+ # Maybe invert the padded vs. valid token values
136
+ if not valid_ones:
137
+ mask = ~mask
138
+ return mask
139
+
140
+
141
+ class FilterbankFeatures(nn.Module):
142
+ """Featurizer that converts wavs to Mel Spectrograms.
143
+ See AudioToMelSpectrogramPreprocessor for args.
144
+ """
145
+
146
+ def __init__(
147
+ self,
148
+ sample_rate=16000,
149
+ n_window_size=320,
150
+ n_window_stride=160,
151
+ window="hann",
152
+ normalize="per_feature",
153
+ n_fft=None,
154
+ preemph=0.97,
155
+ nfilt=64,
156
+ lowfreq=0,
157
+ highfreq=None,
158
+ log=True,
159
+ log_zero_guard_type="add",
160
+ log_zero_guard_value=2**-24,
161
+ dither=CONSTANT,
162
+ pad_to=16,
163
+ max_duration=16.7,
164
+ frame_splicing=1,
165
+ exact_pad=False,
166
+ pad_value=0,
167
+ mag_power=2.0,
168
+ use_grads=False,
169
+ rng=None,
170
+ nb_augmentation_prob=0.0,
171
+ nb_max_freq=4000,
172
+ mel_norm="slaney",
173
+ stft_exact_pad=False, # Deprecated arguments; kept for config compatibility
174
+ stft_conv=False, # Deprecated arguments; kept for config compatibility
175
+ ):
176
+ super().__init__()
177
+ if stft_conv or stft_exact_pad:
178
+ print(
179
+ "Using torch_stft is deprecated and has been removed. The values have been forcibly set to False "
180
+ "for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True "
181
+ "as needed."
182
+ )
183
+ if exact_pad and n_window_stride % 2 == 1:
184
+ raise NotImplementedError(
185
+ f"{self} received exact_pad == True, but hop_size was odd. If audio_length % hop_size == 0. Then the "
186
+ "returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size."
187
+ )
188
+ self.log_zero_guard_value = log_zero_guard_value
189
+ if (
190
+ n_window_size is None
191
+ or n_window_stride is None
192
+ or not isinstance(n_window_size, int)
193
+ or not isinstance(n_window_stride, int)
194
+ or n_window_size <= 0
195
+ or n_window_stride <= 0
196
+ ):
197
+ raise ValueError(
198
+ f"{self} got an invalid value for either n_window_size or "
199
+ f"n_window_stride. Both must be positive ints."
200
+ )
201
+
202
+ self.win_length = n_window_size
203
+ self.hop_length = n_window_stride
204
+ self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
205
+ self.stft_pad_amount = (self.n_fft - self.hop_length) // 2 if exact_pad else None
206
+ self.exact_pad = exact_pad
207
+
208
+ if exact_pad:
209
+ print("STFT using exact pad")
210
+ torch_windows = {
211
+ 'hann': torch.hann_window,
212
+ 'hamming': torch.hamming_window,
213
+ 'blackman': torch.blackman_window,
214
+ 'bartlett': torch.bartlett_window,
215
+ 'none': None,
216
+ }
217
+ window_fn = torch_windows.get(window, None)
218
+ window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None
219
+ self.register_buffer("window", window_tensor)
220
+
221
+ self.normalize = normalize
222
+ self.log = log
223
+ self.dither = dither
224
+ self.frame_splicing = frame_splicing
225
+ self.nfilt = nfilt
226
+ self.preemph = preemph
227
+ self.pad_to = pad_to
228
+ highfreq = highfreq or sample_rate / 2
229
+
230
+ filterbanks = torch.tensor(
231
+ librosa.filters.mel(
232
+ sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq, norm=mel_norm
233
+ ),
234
+ dtype=torch.float,
235
+ ).unsqueeze(0)
236
+ self.register_buffer("fb", filterbanks)
237
+
238
+ # Calculate maximum sequence length
239
+ max_length = self.get_seq_len(torch.tensor(max_duration * sample_rate, dtype=torch.float))
240
+ max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0
241
+ self.max_length = max_length + max_pad
242
+ self.pad_value = pad_value
243
+ self.mag_power = mag_power
244
+
245
+ # We want to avoid taking the log of zero
246
+ # There are two options: either adding or clamping to a small value
247
+ if log_zero_guard_type not in ["add", "clamp"]:
248
+ raise ValueError(
249
+ f"{self} received {log_zero_guard_type} for the "
250
+ f"log_zero_guard_type parameter. It must be either 'add' or "
251
+ f"'clamp'."
252
+ )
253
+
254
+ self.use_grads = use_grads
255
+ if not use_grads:
256
+ self.forward = torch.no_grad()(self.forward)
257
+ self._rng = random.Random() if rng is None else rng
258
+ self.nb_augmentation_prob = nb_augmentation_prob
259
+ if self.nb_augmentation_prob > 0.0:
260
+ if nb_max_freq >= sample_rate / 2:
261
+ self.nb_augmentation_prob = 0.0
262
+ else:
263
+ self._nb_max_fft_bin = int((nb_max_freq / sample_rate) * n_fft)
264
+
265
+ # log_zero_guard_value is the small value we want to use; we support
266
+ # an actual number, or "tiny", or "eps"
267
+ self.log_zero_guard_type = log_zero_guard_type
268
+
269
+ def stft(self, x):
270
+ return torch.stft(
271
+ x,
272
+ n_fft=self.n_fft,
273
+ hop_length=self.hop_length,
274
+ win_length=self.win_length,
275
+ center=False if self.exact_pad else True,
276
+ window=self.window.to(dtype=torch.float),
277
+ return_complex=True,
278
+ )
279
+
280
+ def log_zero_guard_value_fn(self, x):
281
+ if isinstance(self.log_zero_guard_value, str):
282
+ if self.log_zero_guard_value == "tiny":
283
+ return torch.finfo(x.dtype).tiny
284
+ elif self.log_zero_guard_value == "eps":
285
+ return torch.finfo(x.dtype).eps
286
+ else:
287
+ raise ValueError(
288
+ f"{self} received {self.log_zero_guard_value} for the "
289
+ f"log_zero_guard_type parameter. It must be either a "
290
+ f"number, 'tiny', or 'eps'"
291
+ )
292
+ else:
293
+ return self.log_zero_guard_value
294
+
295
+ def get_seq_len(self, seq_len):
296
+ # Assumes center=True when stft_pad_amount is None, i.e. torch.stft pads n_fft // 2 on each side
297
+ pad_amount = self.stft_pad_amount * 2 if self.stft_pad_amount is not None else self.n_fft // 2 * 2
298
+ seq_len = torch.floor_divide((seq_len + pad_amount - self.n_fft), self.hop_length) + 1
299
+ return seq_len.to(dtype=torch.long)
300
+
301
+ @property
302
+ def filter_banks(self):
303
+ return self.fb
304
+
305
+ def forward(self, x, seq_len, linear_spec=False):
306
+ seq_len = self.get_seq_len(seq_len)
307
+
308
+ if self.stft_pad_amount is not None:
309
+ x = torch.nn.functional.pad(
310
+ x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "reflect"
311
+ ).squeeze(1)
312
+
313
+ # dither (only in training mode for eval determinism)
314
+ if self.training and self.dither > 0:
315
+ x += self.dither * torch.randn_like(x)
316
+
317
+ # do preemphasis
318
+ if self.preemph is not None:
319
+ x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1)
320
+
321
+ # disable autocast to get full range of stft values
322
+ with torch.amp.autocast(x.device.type, enabled=False):
323
+ x = self.stft(x)
324
+
325
+ # torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude
326
+ # guard is needed for sqrt if grads are passed through
327
+ guard = 0 if not self.use_grads else CONSTANT
328
+ x = torch.view_as_real(x)
329
+ x = torch.sqrt(x.pow(2).sum(-1) + guard)
330
+
331
+ if self.training and self.nb_augmentation_prob > 0.0:
332
+ for idx in range(x.shape[0]):
333
+ if self._rng.random() < self.nb_augmentation_prob:
334
+ x[idx, self._nb_max_fft_bin :, :] = 0.0
335
+
336
+ # get power spectrum
337
+ if self.mag_power != 1.0:
338
+ x = x.pow(self.mag_power)
339
+
340
+ # return plain spectrogram if required
341
+ if linear_spec:
342
+ return x, seq_len
343
+
344
+ # dot with filterbank energies
345
+ x = torch.matmul(self.fb.to(x.dtype), x)
346
+ # log features if required
347
+ if self.log:
348
+ if self.log_zero_guard_type == "add":
349
+ x = torch.log(x + self.log_zero_guard_value_fn(x))
350
+ elif self.log_zero_guard_type == "clamp":
351
+ x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x)))
352
+ else:
353
+ raise ValueError("log_zero_guard_type was not understood")
354
+
355
+ # frame splicing if required
356
+ if self.frame_splicing > 1:
357
+ x = splice_frames(x, self.frame_splicing)
358
+
359
+ # normalize if required
360
+ if self.normalize:
361
+ x, _, _ = normalize_batch(x, seq_len, normalize_type=self.normalize)
362
+
363
+ # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency)
364
+ max_len = x.size(-1)
365
+ mask = torch.arange(max_len, device=x.device)
366
+ mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1)
367
+ x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value)
368
+ del mask
369
+ pad_to = self.pad_to
370
+ if pad_to == "max":
371
+ x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), value=self.pad_value)
372
+ elif pad_to > 0:
373
+ pad_amt = x.size(-1) % pad_to
374
+ if pad_amt != 0:
375
+ x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
376
+ return x, seq_len
377
+
378
+
379
+ class FilterbankFeaturesTA(nn.Module):
380
+ """
381
+ Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction.
382
+
383
+ See `AudioToMelSpectrogramPreprocessor` for args.
384
+
385
+ """
386
+
387
+ def __init__(
388
+ self,
389
+ sample_rate: int = 16000,
390
+ n_window_size: int = 320,
391
+ n_window_stride: int = 160,
392
+ normalize: Optional[str] = "per_feature",
393
+ nfilt: int = 64,
394
+ n_fft: Optional[int] = None,
395
+ preemph: float = 0.97,
396
+ lowfreq: float = 0,
397
+ highfreq: Optional[float] = None,
398
+ log: bool = True,
399
+ log_zero_guard_type: str = "add",
400
+ log_zero_guard_value: Union[float, str] = 2**-24,
401
+ dither: float = 1e-5,
402
+ window: str = "hann",
403
+ pad_to: int = 0,
404
+ pad_value: float = 0.0,
405
+ mel_norm="slaney",
406
+ # Seems like no one uses these options anymore. Don't complicate the code by supporting them.
407
+ use_grads: bool = False, # Deprecated arguments; kept for config compatibility
408
+ max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility
409
+ frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility
410
+ exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
411
+ nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility
412
+ nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility
413
+ mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility
414
+ rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility
415
+ stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
416
+ stft_conv: bool = False, # Deprecated arguments; kept for config compatibility
417
+ ):
418
+ super().__init__()
419
+ if not HAVE_TORCHAUDIO:
420
+ raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}")
421
+
422
+ # Make sure log zero guard is supported, if given as a string
423
+ supported_log_zero_guard_strings = {"eps", "tiny"}
424
+ if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings:
425
+ raise ValueError(
426
+ f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}"
427
+ )
428
+
429
+ # Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class
430
+ self.torch_windows = {
431
+ 'hann': torch.hann_window,
432
+ 'hamming': torch.hamming_window,
433
+ 'blackman': torch.blackman_window,
434
+ 'bartlett': torch.bartlett_window,
435
+ 'ones': torch.ones,
436
+ None: torch.ones,
437
+ }
438
+
439
+ # Ensure we can look up the window function
440
+ if window not in self.torch_windows:
441
+ raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}")
442
+
443
+ self.win_length = n_window_size
444
+ self.hop_length = n_window_stride
445
+ self._sample_rate = sample_rate
446
+ self._normalize_strategy = normalize
447
+ self._use_log = log
448
+ self._preemphasis_value = preemph
449
+ self.log_zero_guard_type = log_zero_guard_type
450
+ self.log_zero_guard_value: Union[str, float] = log_zero_guard_value
451
+ self.dither = dither
452
+ self.pad_to = pad_to
453
+ self.pad_value = pad_value
454
+ self.n_fft = n_fft
455
+ self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram(
456
+ sample_rate=self._sample_rate,
457
+ win_length=self.win_length,
458
+ hop_length=self.hop_length,
459
+ n_mels=nfilt,
460
+ window_fn=self.torch_windows[window],
461
+ mel_scale="slaney",
462
+ norm=mel_norm,
463
+ n_fft=n_fft,
464
+ f_max=highfreq,
465
+ f_min=lowfreq,
466
+ wkwargs={"periodic": False},
467
+ )
468
+
469
+ @property
470
+ def filter_banks(self):
471
+ """Matches the analogous class"""
472
+ return self._mel_spec_extractor.mel_scale.fb
473
+
474
+ def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float:
475
+ if isinstance(self.log_zero_guard_value, float):
476
+ return self.log_zero_guard_value
477
+ return getattr(torch.finfo(dtype), self.log_zero_guard_value)
478
+
479
+ def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor:
480
+ if self.training and self.dither > 0.0:
481
+ noise = torch.randn_like(signals) * self.dither
482
+ signals = signals + noise
483
+ return signals
484
+
485
+ def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor:
486
+ if self._preemphasis_value is not None:
487
+ padded = torch.nn.functional.pad(signals, (1, 0))
488
+ signals = signals - self._preemphasis_value * padded[:, :-1]
489
+ return signals
490
+
491
+ def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor:
492
+ out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long()
493
+ return out_lengths
494
+
495
+ def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor:
496
+ # Only apply during training; else need to capture dynamic shape for exported models
497
+ if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0:
498
+ return features
499
+ pad_length = self.pad_to - (features.shape[-1] % self.pad_to)
500
+ return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value)
501
+
502
+ def _apply_log(self, features: torch.Tensor) -> torch.Tensor:
503
+ if self._use_log:
504
+ zero_guard = self._resolve_log_zero_guard_value(features.dtype)
505
+ if self.log_zero_guard_type == "add":
506
+ features = features + zero_guard
507
+ elif self.log_zero_guard_type == "clamp":
508
+ features = features.clamp(min=zero_guard)
509
+ else:
510
+ raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'")
511
+ features = features.log()
512
+ return features
513
+
514
+ def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor:
515
+ # Complex FFT needs to be done in single precision
516
+ with torch.amp.autocast('cuda', enabled=False):
517
+ features = self._mel_spec_extractor(waveform=signals)
518
+ return features
519
+
520
+ def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
521
+ # For consistency, this function always does a masked fill even if not normalizing.
522
+ mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False)
523
+ features = features.masked_fill(mask, 0.0)
524
+ # Maybe don't normalize
525
+ if self._normalize_strategy is None:
526
+ return features
527
+ # Use the log zero guard for the sqrt zero guard
528
+ guard_value = self._resolve_log_zero_guard_value(features.dtype)
529
+ if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features":
530
+ # 'all_features' reduces over each sample; 'per_feature' reduces over each channel
531
+ reduce_dim = 2
532
+ if self._normalize_strategy == "all_features":
533
+ reduce_dim = [1, 2]
534
+ # [B, D, T] -> [B, D, 1] or [B, 1, 1]
535
+ means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1))
536
+ stds = (
537
+ features.sub(means)
538
+ .masked_fill(mask, 0.0)
539
+ .pow(2.0)
540
+ .sum(dim=reduce_dim, keepdim=True) # [B, D, T] -> [B, D, 1] or [B, 1, 1]
541
+ .div(lengths.view(-1, 1, 1) - 1) # unbiased estimator (divide by N - 1)
542
+ .clamp(min=guard_value) # avoid sqrt(0)
543
+ .sqrt()
544
+ )
545
+ features = (features - means) / (stds + eps)
546
+ else:
547
+ # Deprecating constant std/mean
548
+ raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}")
549
+ features = features.masked_fill(mask, 0.0)
550
+ return features
551
+
552
+ def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
553
+ feature_lengths = self._compute_output_lengths(input_lengths=length)
554
+ signals = self._apply_dithering(signals=input_signal)
555
+ signals = self._apply_preemphasis(signals=signals)
556
+ features = self._extract_spectrograms(signals=signals)
557
+ features = self._apply_log(features=features)
558
+ features = self._apply_normalization(features=features, lengths=feature_lengths)
559
+ features = self._apply_pad_to(features=features)
560
+ return features, feature_lengths
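
A minimal sketch of driving this preprocessor on its own, assuming the checkpoint's `mel_spectrogram_config` (the same dict `XVectorModel` unpacks below) supplies the constructor arguments; the repo id and audio shapes are placeholders, not values from this commit.

import torch

# Illustrative only: constructor kwargs come from the checkpoint config, repo id is a placeholder.
config = XVectorConfig.from_pretrained("user/xvector-checkpoint")
preprocessor = AudioToMelSpectrogramPreprocessor(**config.mel_spectrogram_config)

waveforms = torch.randn(2, 32000)            # [batch, samples] of 16 kHz audio (assumed rate)
lengths = torch.tensor([32000, 24000])       # valid samples per example
mel, mel_lengths = preprocessor(input_signal=waveforms, length=lengths)
# mel: [batch, n_mels, frames]; mel_lengths: valid frames per example
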
logging.py ADDED
@@ -0,0 +1,55 @@
1
+ import logging
2
+
3
+
4
+ # Function to convert HEX to ANSI 24-bit escape code
5
+ def hex_to_ansi(hex_color, is_background=False):
6
+ """Convert a hex color code to an ANSI escape sequence."""
7
+ hex_color = hex_color.lstrip("#") # Remove '#' if present
8
+ if len(hex_color) != 6:
9
+ raise ValueError("Invalid hex color format. Use #RRGGBB.")
10
+
11
+ r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
12
+ return f"\033[{48 if is_background else 38};2;{r};{g};{b}m"
13
+
14
+ # Custom log formatter with level-specific colors
15
+ class ColoredFormatter(logging.Formatter):
16
+ # Define hex colors per log level
17
+ COLORS = {
18
+ "DEBUG": {"HEADER": "#1E3A8A", "TIMESTAMP": "#2563EB"}, # Dark Blue / Blue
19
+ "INFO": {"HEADER": "#166534", "TIMESTAMP": "#22C55E"}, # Dark Green / Green
20
+ "WARNING": {"HEADER": "#92400E", "TIMESTAMP": "#FACC15"}, # Dark Yellow / Yellow
21
+ "ERROR": {"HEADER": "#7F1D1D", "TIMESTAMP": "#EF4444"}, # Dark Red / Red
22
+ "CRITICAL": {"HEADER": "#581C87", "TIMESTAMP": "#C084FC"}, # Dark Purple / Purple
23
+ }
24
+
25
+ def format(self, record):
26
+ # Extract filename and line number
27
+ filename = record.pathname.split("/")[-1]
28
+ line_no = record.lineno
29
+ level_name = record.levelname
30
+
31
+ # Choose colors based on log level
32
+ level_colors = self.COLORS.get(level_name, self.COLORS["INFO"])
33
+ header_color = hex_to_ansi(level_colors["HEADER"])
34
+ timestamp_color = hex_to_ansi(level_colors["TIMESTAMP"])
35
+ reset_color = "\033[0m" # Reset to default terminal color
36
+
37
+ # Format header as "[LEVEL|file.py:line]"
38
+ header = f"{header_color}[{level_name}|{filename}:{line_no}]{reset_color}"
39
+
40
+ # Format timestamp
41
+ timestamp = f"{timestamp_color}{self.formatTime(record, self.datefmt)}{reset_color}"
42
+
43
+ # Format message
44
+ message = f"\033[37m{record.getMessage()}{reset_color}" # White message
45
+
46
+ return f"{header} {timestamp} >> {message}"
47
+
48
+
49
+ # Set up logger
50
+ logger = logging.getLogger(__name__)
51
+ logger.setLevel(logging.INFO)
52
+ console_handler = logging.StreamHandler()
53
+ formatter = ColoredFormatter(datefmt="%Y-%m-%d %H:%M:%S")
54
+ console_handler.setFormatter(formatter)
55
+ logger.addHandler(console_handler)
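
For reference, a short illustrative sketch of using the module-level logger defined above from elsewhere in this package; the messages themselves are made up.

from .logging import logger

logger.info("Preprocessor initialized")
logger.warning("attention_mask not provided; assuming all samples are valid")
# Each record renders as "[LEVEL|file.py:line] YYYY-MM-DD HH:MM:SS >> message" with level-specific colors.
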
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:553b3b3f58b772c5a184701cc761f96c36457df105da5fdc4336e9ad73f0209d
3
+ size 28476124
modeling_xvector.py ADDED
@@ -0,0 +1,153 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Union, Tuple
3
+
4
+ from rich import print
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from transformers import PreTrainedModel
10
+ from transformers.utils import ModelOutput
11
+
12
+ from .configuration_xvector import XVectorConfig
13
+ from .audio_processing import AudioToMelSpectrogramPreprocessor
14
+ from .audio_processing import SpectrogramAugmentation
15
+ from .conv_asr import XVectorEncoder, SpeakerDecoder
16
+ from .angular_loss import AdditiveMarginSoftmaxLoss, AdditiveAngularMarginSoftmaxLoss
17
+
18
+
19
+ @dataclass
20
+ class XVectorBaseModelOutput(ModelOutput):
21
+
22
+ encoder_outputs: torch.FloatTensor = None
23
+ extract_features: torch.FloatTensor = None
24
+ output_lengths: torch.FloatTensor = None
25
+
26
+
27
+ @dataclass
28
+ class XVectorSequenceClassifierOutput(ModelOutput):
29
+
30
+ loss: torch.FloatTensor = None
31
+ logits: torch.FloatTensor = None
32
+ embeddings: torch.FloatTensor = None
33
+
34
+
35
+ class XVectorPreTrainedModel(PreTrainedModel):
36
+
37
+ config_class = XVectorConfig
38
+ base_model_prefix = "xvector"
39
+ main_input_name = "input_values"
40
+
41
+ def _init_weights(self, module):
42
+ """Initialize the weights"""
43
+ config: XVectorConfig = self.config
44
+ if isinstance(module, (nn.Linear, nn.Conv1d)):
45
+ module.weight.data.normal_(mean=0.0, std=config.initializer_range)
46
+ if module.bias is not None:
47
+ module.bias.data.zero_()
48
+ elif isinstance(module, nn.Conv2d):
49
+ module.weight.data.normal_(mean=0.0, std=config.initializer_range)
50
+ if module.bias is not None:
51
+ module.bias.data.zero_()
52
+ elif isinstance(module, nn.LayerNorm):
53
+ module.bias.data.zero_()
54
+ module.weight.data.fill_(1.0)
55
+ elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
56
+ nn.init.constant_(module.weight, 1)
57
+ nn.init.constant_(module.bias, 0)
58
+
59
+ @property
60
+ def num_weights(self):
61
+ """
62
+ Utility property that returns the total number of trainable parameters of the model.
63
+ """
64
+ return self._num_weights()
65
+
66
+ @torch.jit.ignore
67
+ def _num_weights(self):
68
+ num: int = 0
69
+ for p in self.parameters():
70
+ if p.requires_grad:
71
+ num += p.numel()
72
+ return num
73
+
74
+
75
+ class XVectorModel(XVectorPreTrainedModel):
76
+
77
+ def __init__(self, config: XVectorConfig):
78
+ super().__init__(config)
79
+ self.config = config
80
+
81
+ self.preprocessor = AudioToMelSpectrogramPreprocessor(**config.mel_spectrogram_config)
82
+ self.spec_augment = SpectrogramAugmentation(**config.spectrogram_augmentation_config)
83
+ self.encoder = XVectorEncoder(**config.encoder_config)
84
+
85
+ # Initialize weights and apply final processing
86
+ self.post_init()
87
+
88
+ def forward(
89
+ self,
90
+ input_values: Optional[torch.Tensor],
91
+ attention_mask: Optional[torch.Tensor] = None,
92
+ ) -> Union[Tuple, XVectorBaseModelOutput]:
93
+ if attention_mask is None:
94
+ attention_mask = torch.ones_like(input_values).to(input_values)
95
+ lengths = attention_mask.sum(dim=1).long()
96
+ extract_features, output_lengths = self.preprocessor(input_values, lengths)
97
+ if self.training:
98
+ extract_features = self.spec_augment(extract_features, output_lengths)
99
+ encoder_outputs, output_lengths = self.encoder(extract_features, output_lengths)
100
+
101
+ return XVectorBaseModelOutput(
102
+ encoder_outputs=encoder_outputs,
103
+ extract_features=extract_features,
104
+ output_lengths=output_lengths,
105
+ )
106
+
107
+
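
A hedged sketch of using the base `XVectorModel` on its own to obtain frame-level encoder features; the repo id is a placeholder.

import torch

# Illustrative only: "user/xvector-checkpoint" is a placeholder repo id.
base = XVectorModel.from_pretrained("user/xvector-checkpoint")
wave = torch.randn(2, 16000)                      # one second of 16 kHz audio per example
out = base(input_values=wave)                     # attention_mask defaults to all ones
# out.encoder_outputs: [batch, channels, frames]; out.output_lengths: valid frame counts
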
108
+ class XVectorForSequenceClassification(XVectorPreTrainedModel):
109
+
110
+ def __init__(self, config: XVectorConfig):
111
+ super().__init__(config)
112
+
113
+ self.xvector = XVectorModel(config)
114
+ self.classifier = SpeakerDecoder(**config.decoder_config)
115
+
116
+ if config.objective == 'additive_angular_margin':
117
+ self.loss_fct = AdditiveAngularMarginSoftmaxLoss(**config.objective_config)
118
+ elif config.objective == 'additive_margin':
119
+ self.loss_fct = AdditiveMarginSoftmaxLoss(**config.objective_config)
120
+ elif config.objective == 'cross_entropy':
121
+ self.loss_fct = nn.CrossEntropyLoss(**config.objective_config)
122
+
123
+ self.init_weights()
124
+
125
+ def freeze_base_model(self):
126
+ for param in self.xvector.parameters():
127
+ param.requires_grad = False
128
+
129
+ def forward(
130
+ self,
131
+ input_values: Optional[torch.Tensor],
132
+ attention_mask: Optional[torch.Tensor] = None,
133
+ labels: Optional[torch.Tensor] = None,
134
+ ) -> Union[Tuple, XVectorSequenceClassifierOutput]:
135
+ xvector_outputs = self.xvector(
136
+ input_values,
137
+ attention_mask,
138
+ )
139
+ logits, output_embeddings = self.classifier(
140
+ xvector_outputs.encoder_outputs,
141
+ xvector_outputs.output_lengths
142
+ )
143
+ logits = logits.view(-1, self.config.num_labels)
144
+
145
+ loss = None
146
+ if labels is not None:
147
+ loss = self.loss_fct(logits, labels.view(-1))
148
+
149
+ return XVectorSequenceClassifierOutput(
150
+ loss=loss,
151
+ logits=logits,
152
+ embeddings=output_embeddings,
153
+ )
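
An end-to-end usage sketch for the classifier; the repo id and label lookup are placeholders, not values shipped with this commit.

import torch

model = XVectorForSequenceClassification.from_pretrained("user/xvector-checkpoint")
model.eval()

waveform = torch.randn(1, 48000)                  # three seconds of 16 kHz audio (assumed rate)
attention_mask = torch.ones_like(waveform)
with torch.no_grad():
    outputs = model(input_values=waveform, attention_mask=attention_mask)
predicted_id = outputs.logits.argmax(dim=-1).item()
print(model.config.id2label.get(predicted_id, predicted_id))
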
module.py ADDED
@@ -0,0 +1,105 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class NeuralModule(nn.Module):
6
+
7
+ @property
8
+ def num_weights(self):
9
+ """
10
+ Utility property that returns the total number of trainable parameters of the NeuralModule.
11
+ """
12
+ return self._num_weights()
13
+
14
+ @torch.jit.ignore
15
+ def _num_weights(self):
16
+ num: int = 0
17
+ for p in self.parameters():
18
+ if p.requires_grad:
19
+ num += p.numel()
20
+ return num
21
+
22
+ def freeze(self) -> None:
23
+ r"""
24
+ Freeze all params for inference.
25
+
26
+ This method sets `requires_grad` to False for all parameters of the module.
27
+ It also stores the original `requires_grad` state of each parameter in a dictionary,
28
+ so that `unfreeze()` can restore the original state if `partial=True` is set in `unfreeze()`.
29
+ """
30
+ grad_map = {}
31
+
32
+ for pname, param in self.named_parameters():
33
+ # Store the original grad state
34
+ grad_map[pname] = param.requires_grad
35
+ # Freeze the parameter
36
+ param.requires_grad = False
37
+
38
+ # Store the frozen grad map
39
+ if not hasattr(self, '_frozen_grad_map'):
40
+ self._frozen_grad_map = grad_map
41
+ else:
42
+ self._frozen_grad_map.update(grad_map)
43
+
44
+ self.eval()
45
+
46
+ def unfreeze(self, partial: bool = False) -> None:
47
+ """
48
+ Unfreeze all parameters for training.
49
+
50
+ Allows for either total unfreeze or partial unfreeze (if the module was explicitly frozen previously with `freeze()`).
51
+ The `partial` argument is used to determine whether to unfreeze all parameters or only the parameters that were
52
+ trainable before `freeze()` was called.
53
+
54
+ Example:
55
+ Consider a model that has an encoder and a decoder module. Assume we want the encoder to be frozen always.
56
+
57
+ ```python
58
+ model.encoder.freeze() # Freezes all parameters in the encoder explicitly
59
+ ```
60
+
61
+ During inference, all parameters of the model should be frozen - we do this by calling the model's freeze method.
62
+ This step records that the encoder module parameters were already frozen, and so if partial unfreeze is called,
63
+ we should keep the encoder parameters frozen.
64
+
65
+ ```python
66
+ model.freeze() # Freezes all parameters in the model; encoder remains frozen
67
+ ```
68
+
69
+ Now, during fine-tuning, we want to unfreeze the decoder but keep the encoder frozen. We can do this by calling
70
+ `unfreeze(partial=True)`.
71
+
72
+ ```python
73
+ model.unfreeze(partial=True) # Unfreezes only the decoder; encoder remains frozen
74
+ ```
75
+
76
+ Args:
77
+ partial: If True, only unfreeze parameters that were previously frozen. If the parameter was already frozen
78
+ when calling `freeze()`, it will remain frozen after calling `unfreeze(partial=True)`.
79
+ """
80
+ if partial and not hasattr(self, '_frozen_grad_map'):
81
+ raise ValueError("Cannot unfreeze partially without first freezing the module with `freeze()`")
82
+
83
+ for pname, param in self.named_parameters():
84
+ if not partial:
85
+ # Unfreeze all parameters
86
+ param.requires_grad = True
87
+ else:
88
+ # Unfreeze only parameters that were previously frozen
89
+
90
+ # Check if the parameter was frozen
91
+ if pname in self._frozen_grad_map:
92
+ param.requires_grad = self._frozen_grad_map[pname]
93
+ else:
94
+ # Log a warning if the parameter was not found in the frozen grad map
95
+ print(
96
+ f"Parameter {pname} not found in list of previously frozen parameters. "
97
+ f"Unfreezing this parameter."
98
+ )
99
+ param.requires_grad = True
100
+
101
+ # Clean up the frozen grad map
102
+ if hasattr(self, '_frozen_grad_map'):
103
+ delattr(self, '_frozen_grad_map')
104
+
105
+ self.train()
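
A small sketch (toy module, assumed sizes) exercising the freeze()/unfreeze(partial=True) bookkeeping described in the docstrings above.

import torch.nn as nn

class ToyModule(NeuralModule):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(16, 16)
        self.decoder = nn.Linear(16, 4)

toy = ToyModule()
for p in toy.encoder.parameters():
    p.requires_grad = False           # encoder frozen before the snapshot
toy.freeze()                          # records pre-freeze state, then freezes everything
toy.unfreeze(partial=True)            # decoder trainable again; encoder stays frozen
print(toy.num_weights)                # counts only trainable parameters (here, the decoder's 68)
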
normalization.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ class BatchNorm1d(nn.Module):
5
+ """Applies 1d batch normalization to the input tensor.
6
+
7
+ Arguments
8
+ ---------
9
+ input_shape : tuple
10
+ The expected shape of the input. Alternatively, use ``input_size``.
11
+ input_size : int
12
+ The expected size of the input. Alternatively, use ``input_shape``.
13
+ eps : float
14
+ This value is added to std deviation estimation to improve the numerical
15
+ stability.
16
+ momentum : float
17
+ It is a value used for the running_mean and running_var computation.
18
+ affine : bool
19
+ When set to True, the affine parameters are learned.
20
+ track_running_stats : bool
21
+ When set to True, this module tracks the running mean and variance,
22
+ and when set to False, this module does not track such statistics.
23
+ combine_batch_time : bool
24
+ When True, it combines the batch and time axes.
25
+ skip_transpose : bool
26
+ Whether to skip the transposition.
27
+
28
+
29
+ Example
30
+ -------
31
+ >>> input = torch.randn(100, 10)
32
+ >>> norm = BatchNorm1d(input_shape=input.shape)
33
+ >>> output = norm(input)
34
+ >>> output.shape
35
+ torch.Size([100, 10])
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ input_shape=None,
41
+ input_size=None,
42
+ eps=1e-05,
43
+ momentum=0.1,
44
+ affine=True,
45
+ track_running_stats=True,
46
+ combine_batch_time=False,
47
+ skip_transpose=False,
48
+ ):
49
+ super().__init__()
50
+ self.combine_batch_time = combine_batch_time
51
+ self.skip_transpose = skip_transpose
52
+
53
+ if input_size is None and skip_transpose:
54
+ input_size = input_shape[1]
55
+ elif input_size is None:
56
+ input_size = input_shape[-1]
57
+
58
+ self.norm = nn.BatchNorm1d(
59
+ input_size,
60
+ eps=eps,
61
+ momentum=momentum,
62
+ affine=affine,
63
+ track_running_stats=track_running_stats,
64
+ )
65
+
66
+ def forward(self, x, *args, **kwargs):
67
+ """Returns the normalized input tensor.
68
+
69
+ Arguments
70
+ ---------
71
+ x : torch.Tensor (batch, time, [channels])
72
+ input to normalize. 2d or 3d tensors are expected as input;
73
+ 4d tensors can be used when combine_batch_time=True.
74
+
75
+ Returns
76
+ -------
77
+ x_n : torch.Tensor
78
+ The normalized outputs.
79
+ """
80
+ shape_or = x.shape
81
+ if self.combine_batch_time:
82
+ if x.ndim == 3:
83
+ x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
84
+ else:
85
+ x = x.reshape(
86
+ shape_or[0] * shape_or[1], shape_or[3], shape_or[2]
87
+ )
88
+
89
+ elif not self.skip_transpose:
90
+ x = x.transpose(-1, 1)
91
+
92
+ x_n = self.norm(x)
93
+
94
+ if self.combine_batch_time:
95
+ x_n = x_n.reshape(shape_or)
96
+ elif not self.skip_transpose:
97
+ x_n = x_n.transpose(1, -1)
98
+
99
+ return x_n
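
To complement the 2-D example in the docstring, a sketch of the 3-D case (shapes assumed), where the default skip_transpose=False moves channels into the position nn.BatchNorm1d expects.

import torch

norm = BatchNorm1d(input_size=40)
x = torch.randn(8, 120, 40)      # [batch, time, channels]
y = norm(x)                      # transposed internally, normalized per channel
assert y.shape == x.shape
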
spectrogram_augment.py ADDED
@@ -0,0 +1,223 @@
1
+ import random
2
+ from typing import Union
3
+
4
+ import numpy as np
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class SpecAugment(nn.Module):
11
+ """
12
+ Zeroes out (cuts) random continuous horizontal or
13
+ vertical segments of the spectrogram as described in
14
+ SpecAugment (https://arxiv.org/abs/1904.08779).
15
+
16
+ params:
17
+ freq_masks - how many frequency segments should be cut
18
+ time_masks - how many time segments should be cut
19
+ freq_width - maximum number of frequencies to be cut in one segment
20
+ time_width - maximum number of time steps to be cut in one segment.
21
+ Can be a positive integer or a float value in the range [0, 1].
22
+ If positive integer value, defines maximum number of time steps
23
+ to be cut in one segment.
24
+ If a float value, defines maximum percentage of timesteps that
25
+ are cut adaptively.
26
+ use_vectorized_code - GPU-based implementation with batched masking and GPU rng,
27
+ setting it to False reverts to the legacy implementation.
28
+ Fast implementation is inspired by torchaudio:
29
+ https://github.com/pytorch/audio/blob/ea437b31ce316ea3d66fe73768c0dcb94edb79ad/src/torchaudio/functional/functional.py#L816
30
+ """
31
+
32
+ FREQ_AXIS = 1 # Frequency axis in the spectrogram tensor
33
+ TIME_AXIS = 2 # Time axis in the spectrogram tensor
34
+
35
+ def __init__(
36
+ self,
37
+ freq_masks: int = 0,
38
+ time_masks: int = 0,
39
+ freq_width: int = 10,
40
+ time_width: Union[int, float] = 10,
41
+ rng: random.Random = None,
42
+ mask_value: float = 0.0,
43
+ use_vectorized_code: bool = True,
44
+ ):
45
+ super().__init__()
46
+
47
+ self._rng = random.Random() if rng is None else rng
48
+
49
+ self.freq_masks = freq_masks
50
+ self.time_masks = time_masks
51
+
52
+ self.freq_width = freq_width
53
+ self.time_width = time_width
54
+
55
+ self.mask_value = mask_value
56
+ self.use_vectorized_code = use_vectorized_code
57
+
58
+ if isinstance(time_width, int):
59
+ self.adaptive_temporal_width = False
60
+ else:
61
+ if time_width > 1.0 or time_width < 0.0:
62
+ raise ValueError("If `time_width` is a float value, must be in range [0, 1]")
63
+
64
+ self.adaptive_temporal_width = True
65
+
66
+ @torch.no_grad()
67
+ def forward(self, input_spec, length):
68
+ if self.use_vectorized_code:
69
+ return self._forward_vectorized(input_spec, length)
70
+ else:
71
+ return self._forward_legacy(input_spec, length)
72
+
73
+ def _forward_legacy(self, input_spec, length):
74
+ batch_size, num_freq_bins, _ = input_spec.shape
75
+ # Move lengths to CPU before repeated indexing
76
+ lengths_cpu = length.cpu().numpy()
77
+ # Generate a numpy boolean mask. `True` elements represent where the input spec will be augmented.
78
+ fill_mask: np.array = np.full(shape=input_spec.shape, fill_value=False)
79
+ freq_start_upper_bound = num_freq_bins - self.freq_width
80
+ # Choose different mask ranges for each element of the batch
81
+ for idx in range(batch_size):
82
+ # Set freq masking
83
+ for _ in range(self.freq_masks):
84
+ start = self._rng.randint(0, freq_start_upper_bound)
85
+ width = self._rng.randint(0, self.freq_width)
86
+ fill_mask[idx, start : start + width, :] = True
87
+
88
+ # Derive time width, sometimes based on a percentage of the input length.
89
+ if self.adaptive_temporal_width:
90
+ time_max_width = max(1, int(lengths_cpu[idx] * self.time_width))
91
+ else:
92
+ time_max_width = self.time_width
93
+ time_start_upper_bound = max(1, lengths_cpu[idx] - time_max_width)
94
+
95
+ # Set time masking
96
+ for _ in range(self.time_masks):
97
+ start = self._rng.randint(0, time_start_upper_bound)
98
+ width = self._rng.randint(0, time_max_width)
99
+ fill_mask[idx, :, start : start + width] = True
100
+ # Bring the mask to device and fill spec
101
+ fill_mask = torch.from_numpy(fill_mask).to(input_spec.device)
102
+ masked_spec = input_spec.masked_fill(mask=fill_mask, value=self.mask_value)
103
+ return masked_spec
104
+
105
+ def _forward_vectorized(self, input_spec: torch.Tensor, length: torch.Tensor) -> torch.Tensor:
106
+ # time masks
107
+ input_spec = self._apply_masks(
108
+ input_spec=input_spec,
109
+ num_masks=self.time_masks,
110
+ length=length,
111
+ width=self.time_width,
112
+ axis=self.TIME_AXIS,
113
+ mask_value=self.mask_value,
114
+ )
115
+ # freq masks
116
+ input_spec = self._apply_masks(
117
+ input_spec=input_spec,
118
+ num_masks=self.freq_masks,
119
+ length=length,
120
+ width=self.freq_width,
121
+ axis=self.FREQ_AXIS,
122
+ mask_value=self.mask_value,
123
+ )
124
+ return input_spec
125
+
126
+ def _apply_masks(
127
+ self,
128
+ input_spec: torch.Tensor,
129
+ num_masks: int,
130
+ length: torch.Tensor,
131
+ width: Union[int, float],
132
+ mask_value: float,
133
+ axis: int,
134
+ ) -> torch.Tensor:
135
+
136
+ assert axis in (
137
+ self.FREQ_AXIS,
138
+ self.TIME_AXIS,
139
+ ), f"Axis can only be equal to frequency \
140
+ ({self.FREQ_AXIS}) or time ({self.TIME_AXIS}). Received: {axis=}"
141
+ assert not (
142
+ isinstance(width, float) and axis == self.FREQ_AXIS
143
+ ), "Float width supported \
144
+ only with time axis."
145
+
146
+ batch_size = input_spec.shape[0]
147
+ axis_length = input_spec.shape[axis]
148
+
149
+ # If width is float then it is transformed into a tensor
150
+ if axis == self.TIME_AXIS and isinstance(width, float):
151
+ width = torch.clamp(width * length, max=axis_length).unsqueeze(1)
152
+
153
+ # Generate [0-1) random numbers and then scale the tensors.
154
+ # Use float32 dtype for begin/end mask markers before they are quantized to long.
155
+ mask_width = torch.rand((batch_size, num_masks), device=input_spec.device, dtype=torch.float32) * width
156
+ mask_width = mask_width.long()
157
+ mask_start = torch.rand((batch_size, num_masks), device=input_spec.device, dtype=torch.float32)
158
+
159
+ if axis == self.TIME_AXIS:
160
+ # length can only be used for the time axis
161
+ mask_start = mask_start * (length.unsqueeze(1) - mask_width)
162
+ else:
163
+ mask_start = mask_start * (axis_length - mask_width)
164
+
165
+ mask_start = mask_start.long()
166
+ mask_end = mask_start + mask_width
167
+
168
+ # Create mask values using vectorized indexing
169
+ indices = torch.arange(axis_length, device=input_spec.device)
170
+ # Create a mask_tensor with all the indices.
171
+ # The mask_tensor shape is (batch_size, num_masks, axis_length).
172
+ mask_tensor = (indices >= mask_start.unsqueeze(-1)) & (indices < mask_end.unsqueeze(-1))
173
+
174
+ # Reduce masks to one mask
175
+ mask_tensor = mask_tensor.any(dim=1)
176
+
177
+ # Create a final mask that aligns with the full tensor
178
+ mask = torch.zeros_like(input_spec, dtype=torch.bool)
179
+ if axis == self.TIME_AXIS:
180
+ mask_ranges = mask_tensor[:, None, :]
181
+ else: # axis == self.FREQ_AXIS
182
+ mask_ranges = mask_tensor[:, :, None]
183
+ mask[:, :, :] = mask_ranges
184
+
185
+ # Apply the mask value
186
+ return input_spec.masked_fill(mask=mask, value=mask_value)
187
+
188
+
189
+ class SpecCutout(nn.Module):
190
+ """
191
+ Zeroes out(cuts) random rectangles in the spectrogram
192
+ as described in (https://arxiv.org/abs/1708.04552).
193
+
194
+ params:
195
+ rect_masks - how many rectangular masks should be cut
196
+ rect_freq - maximum size of cut rectangles along the frequency dimension
197
+ rect_time - maximum size of cut rectangles along the time dimension
198
+ """
199
+
200
+ def __init__(self, rect_masks=0, rect_time=5, rect_freq=20, rng=None):
201
+ super(SpecCutout, self).__init__()
202
+
203
+ self._rng = random.Random() if rng is None else rng
204
+
205
+ self.rect_masks = rect_masks
206
+ self.rect_time = rect_time
207
+ self.rect_freq = rect_freq
208
+
209
+ @torch.no_grad()
210
+ def forward(self, input_spec):
211
+ sh = input_spec.shape
212
+
213
+ for idx in range(sh[0]):
214
+ for i in range(self.rect_masks):
215
+ rect_x = self._rng.randint(0, sh[1] - self.rect_freq)
216
+ rect_y = self._rng.randint(0, sh[2] - self.rect_time)
217
+
218
+ w_x = self._rng.randint(0, self.rect_freq)
219
+ w_y = self._rng.randint(0, self.rect_time)
220
+
221
+ input_spec[idx, rect_x : rect_x + w_x, rect_y : rect_y + w_y] = 0.0
222
+
223
+ return input_spec
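
A brief sketch of applying the vectorized SpecAugment path to a dummy batch of log-mel spectrograms; the hyperparameters below are arbitrary choices, not this checkpoint's settings.

import torch

augment = SpecAugment(freq_masks=2, time_masks=2, freq_width=15, time_width=0.05)
spec = torch.randn(4, 80, 300)                    # [batch, n_mels, frames]
lengths = torch.tensor([300, 250, 280, 190])      # valid frames per example
augmented = augment(spec, lengths)                # masked copy; runs under torch.no_grad()
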
tdnn_attention.py ADDED
@@ -0,0 +1,550 @@
1
+ import math
2
+ from typing import List, Optional
3
+
4
+ from numpy import inf
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from torch.nn.init import _calculate_correct_fan
10
+
11
+
12
+ class StatsPoolLayer(nn.Module):
13
+ """Statistics and time average pooling (TAP) layer
14
+
15
+ This computes mean and, optionally, standard deviation statistics across the time dimension.
16
+
17
+ Args:
18
+ feat_in: Input features with shape [B, D, T]
19
+ pool_mode: Type of pool mode. Supported modes are 'xvector' (mean and standard deviation) and 'tap' (time
20
+ average pooling, i.e., mean)
21
+ eps: Epsilon, minimum value before taking the square root, when using 'xvector' mode.
22
+ unbiased: Whether to use the unbiased estimator for the standard deviation when using 'xvector' mode. The default
23
+ for torch.Tensor.std() is True.
24
+
25
+ Returns:
26
+ Pooled statistics with shape [B, D].
27
+
28
+ Raises:
29
+ ValueError if an unsupported pooling mode is specified.
30
+ """
31
+
32
+ def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, unbiased: bool = True):
33
+ super().__init__()
34
+ supported_modes = {"xvector", "tap"}
35
+ if pool_mode not in supported_modes:
36
+ raise ValueError(f"Pool mode must be one of {supported_modes}; got '{pool_mode}'")
37
+ self.pool_mode = pool_mode
38
+ self.feat_in = feat_in
39
+ self.eps = eps
40
+ self.unbiased = unbiased
41
+ if self.pool_mode == 'xvector':
42
+ # Mean + std
43
+ self.feat_in *= 2
44
+
45
+ def forward(self, encoder_output, length=None):
46
+ if length is None:
47
+ mean = encoder_output.mean(dim=-1) # Time Axis
48
+ if self.pool_mode == 'xvector':
49
+ correction = 1 if self.unbiased else 0
50
+ std = encoder_output.std(dim=-1, correction=correction).clamp(min=self.eps)
51
+ pooled = torch.cat([mean, std], dim=-1)
52
+ else:
53
+ pooled = mean
54
+ else:
55
+ mask = make_seq_mask_like(like=encoder_output, lengths=length, valid_ones=False)
56
+ encoder_output = encoder_output.masked_fill(mask, 0.0)
57
+ # [B, D, T] -> [B, D]
58
+ means = encoder_output.mean(dim=-1)
59
+ # Re-scale so the mean is taken over valid (non-padded) frames only
60
+ means = means * (encoder_output.shape[-1] / length).unsqueeze(-1)
61
+ if self.pool_mode == "xvector":
62
+ correction = 1 if self.unbiased else 0
63
+ stds = (
64
+ encoder_output.sub(means.unsqueeze(-1))
65
+ .masked_fill(mask, 0.0)
66
+ .pow(2.0)
67
+ .sum(-1) # [B, D, T] -> [B, D]
68
+ .div(length.view(-1, 1).sub(correction))
69
+ .clamp(min=self.eps)
70
+ .sqrt()
71
+ )
72
+ pooled = torch.cat((means, stds), dim=-1)
73
+ else:
74
+ pooled = means
75
+ return pooled
76
+
77
+
78
+ class AttentivePoolLayer(nn.Module):
79
+ """
80
+ Attention pooling layer for pooling speaker embeddings
81
+ Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)
82
+ inputs:
83
+ inp_filters: input feature channel length from encoder
84
+ attention_channels: intermediate attention channel size
85
+ kernel_size: kernel_size for TDNN and attention conv1d layers (default: 1)
86
+ dilation: dilation size for TDNN and attention conv1d layers (default: 1)
87
+ """
88
+
89
+ def __init__(
90
+ self,
91
+ inp_filters: int,
92
+ attention_channels: int = 128,
93
+ kernel_size: int = 1,
94
+ dilation: int = 1,
95
+ eps: float = 1e-10,
96
+ ):
97
+ super().__init__()
98
+
99
+ self.feat_in = 2 * inp_filters
100
+
101
+ self.attention_layer = nn.Sequential(
102
+ TDNNModule(inp_filters * 3, attention_channels, kernel_size=kernel_size, dilation=dilation),
103
+ nn.Tanh(),
104
+ nn.Conv1d(
105
+ in_channels=attention_channels,
106
+ out_channels=inp_filters,
107
+ kernel_size=kernel_size,
108
+ dilation=dilation,
109
+ ),
110
+ )
111
+ self.eps = eps
112
+
113
+ def forward(self, x, length=None):
114
+ max_len = x.size(2)
115
+
116
+ if length is None:
117
+ length = torch.ones(x.shape[0], device=x.device)
118
+
119
+ mask, num_values = lens_to_mask(length, max_len=max_len, device=x.device)
120
+
121
+ # encoder statistics
122
+ mean, std = get_statistics_with_mask(x, mask / num_values)
123
+ mean = mean.unsqueeze(2).repeat(1, 1, max_len)
124
+ std = std.unsqueeze(2).repeat(1, 1, max_len)
125
+ attn = torch.cat([x, mean, std], dim=1)
126
+
127
+ # attention statistics
128
+ attn = self.attention_layer(attn) # attention pass
129
+ attn = attn.masked_fill(mask == 0, -inf)
130
+ alpha = F.softmax(attn, dim=2) # attention values, α
131
+ mu, sg = get_statistics_with_mask(x, alpha) # µ and ∑
132
+
133
+ # gather
134
+ return torch.cat((mu, sg), dim=1).unsqueeze(2)
135
+
136
+
137
+ class TDNNModule(nn.Module):
138
+ """
139
+ Time Delayed Neural Module (TDNN) - 1D
140
+ input:
141
+ inp_filters: input filter channels for conv layer
142
+ out_filters: output filter channels for conv layer
143
+ kernel_size: kernel weight size for conv layer
144
+ dilation: dilation for conv layer
145
+ stride: stride for conv layer
146
+ padding: padding for conv layer (default None: chooses padding value such that input and output feature shape matches)
147
+ output:
148
+ tdnn layer output
149
+ """
150
+
151
+ def __init__(
152
+ self,
153
+ inp_filters: int,
154
+ out_filters: int,
155
+ kernel_size: int = 1,
156
+ dilation: int = 1,
157
+ stride: int = 1,
158
+ padding: int = None,
159
+ ):
160
+ super().__init__()
161
+ if padding is None:
162
+ padding = get_same_padding(kernel_size, stride=stride, dilation=dilation)
163
+
164
+ self.conv_layer = nn.Conv1d(
165
+ in_channels=inp_filters,
166
+ out_channels=out_filters,
167
+ kernel_size=kernel_size,
168
+ dilation=dilation,
169
+ padding=padding,
170
+ )
171
+
172
+ self.activation = nn.ReLU()
173
+ self.bn = nn.BatchNorm1d(out_filters)
174
+
175
+ def forward(self, x, length=None):
176
+ x = self.conv_layer(x)
177
+ x = self.activation(x)
178
+ return self.bn(x)
179
+
180
+
181
+ class MaskedSEModule(nn.Module):
182
+ """
183
+ Squeeze and Excite module implementation with conv1d layers
184
+ input:
185
+ inp_filters: input filter channel size
186
+ se_filters: intermediate squeeze and excite channel output and input size
187
+ out_filters: output filter channel size
188
+ kernel_size: kernel_size for both conv1d layers
189
+ dilation: dilation size for both conv1d layers
190
+
191
+ output:
192
+ squeeze and excite layer output
193
+ """
194
+
195
+ def __init__(self, inp_filters: int, se_filters: int, out_filters: int, kernel_size: int = 1, dilation: int = 1):
196
+ super().__init__()
197
+ self.se_layer = nn.Sequential(
198
+ nn.Conv1d(
199
+ inp_filters,
200
+ se_filters,
201
+ kernel_size=kernel_size,
202
+ dilation=dilation,
203
+ ),
204
+ nn.ReLU(),
205
+ nn.BatchNorm1d(se_filters),
206
+ nn.Conv1d(
207
+ se_filters,
208
+ out_filters,
209
+ kernel_size=kernel_size,
210
+ dilation=dilation,
211
+ ),
212
+ nn.Sigmoid(),
213
+ )
214
+
215
+ def forward(self, input, length=None):
216
+ if length is None:
217
+ x = torch.mean(input, dim=2, keepdim=True)
218
+ else:
219
+ max_len = input.size(2)
220
+ mask, num_values = lens_to_mask(length, max_len=max_len, device=input.device)
221
+ x = torch.sum((input * mask), dim=2, keepdim=True) / (num_values)
222
+
223
+ out = self.se_layer(x)
224
+ return out * input
225
+
226
+
227
+ class TDNNSEModule(nn.Module):
228
+ """
229
+ Modified building SE_TDNN group module block from ECAPA implementation for faster training and inference
230
+ Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)
231
+ inputs:
232
+ inp_filters: input filter channel size
233
+ out_filters: output filter channel size
234
+ group_scale: scale value to group wider conv channels (default: 8)
235
+ se_channels: squeeze and excite output channel size (default: 1024/8 = 128)
236
+ kernel_size: kernel_size for group conv1d layers (default: 1)
237
+ dilation: dilation size for group conv1d layers (default: 1)
238
+ """
239
+
240
+ def __init__(
241
+ self,
242
+ inp_filters: int,
243
+ out_filters: int,
244
+ group_scale: int = 8,
245
+ se_channels: int = 128,
246
+ kernel_size: int = 1,
247
+ dilation: int = 1,
248
+ init_mode: str = 'xavier_uniform',
249
+ ):
250
+ super().__init__()
251
+ self.out_filters = out_filters
252
+ padding_val = get_same_padding(kernel_size=kernel_size, dilation=dilation, stride=1)
253
+
254
+ group_conv = nn.Conv1d(
255
+ out_filters,
256
+ out_filters,
257
+ kernel_size=kernel_size,
258
+ dilation=dilation,
259
+ padding=padding_val,
260
+ groups=group_scale,
261
+ )
262
+ self.group_tdnn_block = nn.Sequential(
263
+ TDNNModule(inp_filters, out_filters, kernel_size=1, dilation=1),
264
+ group_conv,
265
+ nn.ReLU(),
266
+ nn.BatchNorm1d(out_filters),
267
+ TDNNModule(out_filters, out_filters, kernel_size=1, dilation=1),
268
+ )
269
+
270
+ self.se_layer = MaskedSEModule(out_filters, se_channels, out_filters)
271
+
272
+ self.apply(lambda x: init_weights(x, mode=init_mode))
273
+
274
+ def forward(self, input, length=None):
275
+ x = self.group_tdnn_block(input)
276
+ x = self.se_layer(x, length)
277
+ return x + input
278
+
279
+
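
An illustrative forward pass through one SE-TDNN block (channel sizes assumed); note the residual connection requires inp_filters == out_filters.

import torch

block = TDNNSEModule(inp_filters=512, out_filters=512, se_channels=128, kernel_size=3, dilation=2)
feats = torch.randn(4, 512, 200)                  # [batch, channels, frames]
lengths = torch.tensor([200, 180, 150, 90])
out = block(feats, lengths)                       # same shape as feats, with the residual added
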
280
+ class MaskedConv1d(nn.Module):
281
+ __constants__ = ["use_conv_mask", "real_out_channels", "heads"]
282
+
283
+ def __init__(
284
+ self,
285
+ in_channels,
286
+ out_channels,
287
+ kernel_size,
288
+ stride=1,
289
+ padding=0,
290
+ dilation=1,
291
+ groups=1,
292
+ heads=-1,
293
+ bias=False,
294
+ use_mask=True,
295
+ quantize=False,
296
+ ):
297
+ super(MaskedConv1d, self).__init__()
298
+
299
+ if not (heads == -1 or groups == in_channels):
300
+ raise ValueError("Only use heads for depthwise convolutions")
301
+
302
+ self.real_out_channels = out_channels
303
+ if heads != -1:
304
+ in_channels = heads
305
+ out_channels = heads
306
+ groups = heads
307
+
308
+ # preserve original padding
309
+ self._padding = padding
310
+
311
+ # if padding is a tuple/list, it is considered as asymmetric padding
312
+ if type(padding) in (tuple, list):
313
+ self.pad_layer = nn.ConstantPad1d(padding, value=0.0)
314
+ # reset padding for conv since pad_layer will handle this
315
+ padding = 0
316
+ else:
317
+ self.pad_layer = None
318
+
319
+ self.conv = nn.Conv1d(
320
+ in_channels,
321
+ out_channels,
322
+ kernel_size,
323
+ stride=stride,
324
+ padding=padding,
325
+ dilation=dilation,
326
+ groups=groups,
327
+ bias=bias,
328
+ )
329
+ self.use_mask = use_mask
330
+ self.heads = heads
331
+
332
+ # Calculations for "same" padding cache
333
+ self.same_padding = (self.conv.stride[0] == 1) and (
334
+ 2 * self.conv.padding[0] == self.conv.dilation[0] * (self.conv.kernel_size[0] - 1)
335
+ )
336
+ if self.pad_layer is None:
337
+ self.same_padding_asymmetric = False
338
+ else:
339
+ self.same_padding_asymmetric = (self.conv.stride[0] == 1) and (
340
+ sum(self._padding) == self.conv.dilation[0] * (self.conv.kernel_size[0] - 1)
341
+ )
342
+
343
+ # `self.lens` caches consecutive integers from 0 to `self.max_len` that are used to compute the mask for a
344
+ # batch. Recomputed to bigger size as needed. Stored on a device of the latest batch lens.
345
+ if self.use_mask:
346
+ self.max_len = torch.tensor(0)
347
+ self.lens = torch.tensor(0)
348
+
349
+ def get_seq_len(self, lens):
350
+ if self.same_padding or self.same_padding_asymmetric:
351
+ return lens
352
+
353
+ if self.pad_layer is None:
354
+ return (
355
+ torch.div(
356
+ lens + 2 * self.conv.padding[0] - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1,
357
+ self.conv.stride[0],
358
+ rounding_mode='trunc',
359
+ )
360
+ + 1
361
+ )
362
+ else:
363
+ return (
364
+ torch.div(
365
+ lens + sum(self._padding) - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1,
366
+ self.conv.stride[0],
367
+ rounding_mode='trunc',
368
+ )
369
+ + 1
370
+ )
371
+
372
+ def forward(self, x, lens):
373
+ if self.use_mask:
374
+ # Generally will be called by ConvASREncoder, but kept as single gpu backup.
375
+ if x.size(2) > self.max_len:
376
+ self.update_masked_length(x.size(2), device=lens.device)
377
+ x = self.mask_input(x, lens)
378
+
379
+ # Update lengths
380
+ lens = self.get_seq_len(lens)
381
+
382
+ # asymmetric pad if necessary
383
+ if self.pad_layer is not None:
384
+ x = self.pad_layer(x)
385
+
386
+ sh = x.shape
387
+ if self.heads != -1:
388
+ x = x.view(-1, self.heads, sh[-1])
389
+
390
+ out = self.conv(x)
391
+
392
+ if self.heads != -1:
393
+ out = out.view(sh[0], self.real_out_channels, -1)
394
+
395
+ return out, lens
396
+
397
+ def update_masked_length(self, max_len, seq_range=None, device=None):
398
+ if seq_range is None:
399
+ self.lens, self.max_len = _masked_conv_init_lens(self.lens, max_len, self.max_len)
400
+ self.lens = self.lens.to(device)
401
+ else:
402
+ self.lens = seq_range
403
+ self.max_len = torch.tensor(max_len)
404
+
405
+ def mask_input(self, x, lens):
406
+ max_len = x.size(2)
407
+ mask = self.lens[:max_len].unsqueeze(0).to(lens.device) < lens.unsqueeze(1)
408
+ x = x * mask.unsqueeze(1).to(device=x.device)
409
+ return x
410
+
411
+
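
A quick sanity check (illustrative numbers) of the length arithmetic in get_seq_len: with kernel_size=3, stride=2, padding=1, an input of 100 valid frames maps to floor((100 + 2 - 2 - 1) / 2) + 1 = 50.

import torch

conv = MaskedConv1d(in_channels=64, out_channels=64, kernel_size=3, stride=2, padding=1)
x = torch.randn(2, 64, 100)
lens = torch.tensor([100, 73])
out, out_lens = conv(x, lens)     # out: [2, 64, 50]; out_lens: tensor([50, 37])
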
412
+ @torch.jit.script
413
+ def _masked_conv_init_lens(lens: torch.Tensor, current_maxlen: int, original_maxlen: torch.Tensor):
414
+ if current_maxlen > original_maxlen:
415
+ new_lens = torch.arange(current_maxlen)
416
+ new_max_lens = torch.tensor(current_maxlen)
417
+ else:
418
+ new_lens = lens
419
+ new_max_lens = original_maxlen
420
+ return new_lens, new_max_lens
421
+
422
+
423
+ def get_same_padding(kernel_size, stride, dilation) -> int:
424
+ if stride > 1 and dilation > 1:
425
+ raise ValueError("Only stride OR dilation may be greater than 1")
426
+ return (dilation * (kernel_size - 1)) // 2
427
+
428
+
429
+ def lens_to_mask(lens: torch.Tensor, max_len: int, device: str = None):
430
+ """
431
+ outputs masking labels for list of lengths of audio features, with max length of any
432
+ mask as max_len
433
+ input:
434
+ lens: tensor of sequence lengths
435
+ max_len: max length of any audio feature
436
+ output:
437
+ mask: masked labels
438
+ num_values: sum of mask values for each feature (useful for computing statistics later)
439
+ """
440
+ lens_mat = torch.arange(max_len).to(device)
441
+ mask = lens_mat[:max_len].unsqueeze(0) < lens.unsqueeze(1)
442
+ mask = mask.unsqueeze(1)
443
+ num_values = torch.sum(mask, dim=2, keepdim=True)
444
+ return mask, num_values
445
+
446
+
447
+ def get_statistics_with_mask(x: torch.Tensor, m: torch.Tensor, dim: int = 2, eps: float = 1e-10):
448
+ """
449
+ compute mean and standard deviation of input(x) provided with its masking labels (m)
450
+ input:
451
+ x: feature input
452
+ m: averaged mask labels
453
+ output:
454
+ mean: mean of input features
455
+ std: standard deviation of input features
456
+ """
457
+ mean = torch.sum((m * x), dim=dim)
458
+ std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
459
+ return mean, std
460
+
461
+
462
+ @torch.jit.script_if_tracing
463
+ def make_seq_mask_like(
464
+ like: torch.Tensor, lengths: torch.Tensor, valid_ones: bool = True, time_dim: int = -1
465
+ ) -> torch.Tensor:
466
+ mask = torch.arange(like.shape[time_dim], device=like.device).repeat(lengths.shape[0], 1).lt(lengths.unsqueeze(-1))
467
+ # Match number of dims in `like` tensor
468
+ for _ in range(like.dim() - mask.dim()):
469
+ mask = mask.unsqueeze(1)
470
+ # If time dim != -1, transpose to proper dim.
471
+ if time_dim != -1:
472
+ mask = mask.transpose(time_dim, -1)
473
+ if not valid_ones:
474
+ mask = ~mask
475
+ return mask
476
+
477
+
478
+ def init_weights(m, mode: Optional[str] = 'xavier_uniform'):
479
+ if isinstance(m, MaskedConv1d):
480
+ init_weights(m.conv, mode)
481
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
482
+ if mode is not None:
483
+ if mode == 'xavier_uniform':
484
+ nn.init.xavier_uniform_(m.weight, gain=1.0)
485
+ elif mode == 'xavier_normal':
486
+ nn.init.xavier_normal_(m.weight, gain=1.0)
487
+ elif mode == 'kaiming_uniform':
488
+ nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
489
+ elif mode == 'kaiming_normal':
490
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
491
+ elif mode == 'tds_uniform':
492
+ tds_uniform_(m.weight)
493
+ elif mode == 'tds_normal':
494
+ tds_normal_(m.weight)
495
+ else:
496
+ raise ValueError("Unknown Initialization mode: {0}".format(mode))
497
+ elif isinstance(m, nn.BatchNorm1d):
498
+ if m.track_running_stats:
499
+ m.running_mean.zero_()
500
+ m.running_var.fill_(1)
501
+ m.num_batches_tracked.zero_()
502
+ if m.affine:
503
+ nn.init.ones_(m.weight)
504
+ nn.init.zeros_(m.bias)
505
+
506
+
507
+ def tds_uniform_(tensor, mode='fan_in'):
508
+ """
509
+ Uniform Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf)
510
+ Normalized to -
511
+
512
+ .. math::
513
+ \\text{bound} = \\text{2} \\times \\sqrt{\\frac{1}{\\text{fan\\_mode}}}
514
+
515
+ Args:
516
+ tensor: an n-dimensional `torch.Tensor`
517
+ mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
518
+ preserves the magnitude of the variance of the weights in the
519
+ forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
520
+ backwards pass.
521
+ """
522
+ fan = _calculate_correct_fan(tensor, mode)
523
+ gain = 2.0 # sqrt(4.0) = 2
524
+ std = gain / math.sqrt(fan) # sqrt(4.0 / fan_in)
525
+ bound = std # Calculate uniform bounds from standard deviation
526
+ with torch.no_grad():
527
+ return tensor.uniform_(-bound, bound)
528
+
529
+
530
+ def tds_normal_(tensor, mode='fan_in'):
531
+ """
532
+ Normal Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf)
533
+ Normalized to -
534
+
535
+ .. math::
536
+ \\text{bound} = \\text{2} \\times \\sqrt{\\frac{1}{\\text{fan\\_mode}}}
537
+
538
+ Args:
539
+ tensor: an n-dimensional `torch.Tensor`
540
+ mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
541
+ preserves the magnitude of the variance of the weights in the
542
+ forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
543
+ backwards pass.
544
+ """
545
+ fan = _calculate_correct_fan(tensor, mode)
546
+ gain = 2.0
547
+ std = gain / math.sqrt(fan) # sqrt(4.0 / fan_in)
548
+ bound = std # Calculate uniform bounds from standard deviation
549
+ with torch.no_grad():
550
+ return tensor.normal_(0.0, bound)
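
Finally, a sketch contrasting the two pooling layers above on a dummy encoder output (dimensions assumed).

import torch

feats = torch.randn(4, 512, 200)                  # [batch, channels, frames]
lengths = torch.tensor([200, 180, 150, 90])

stats_pool = StatsPoolLayer(feat_in=512, pool_mode='xvector')
attn_pool = AttentivePoolLayer(inp_filters=512, attention_channels=128)

pooled_stats = stats_pool(feats, lengths)         # [4, 1024]: mean and std concatenated
pooled_attn = attn_pool(feats, lengths)           # [4, 1024, 1]: attentive mean and std
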