Respair
/

OTA_TextAligner

Model card Files Files and versions

xet

Community

Respair commited on Mar 30

Commit

bdb9479

verified ·

1 Parent(s): e3b8436

Create utils.py

Browse files

Files changed (1) hide show

utils.py +289 -0

utils.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import os
+import os.path as osp
+import sys
+import time
+from collections import defaultdict
+import matplotlib
+import numpy as np
+import soundfile as sf
+import torch
+from torch import nn
+import jiwer
+import matplotlib.pylab as plt
+import functools
+import os
+import random
+import traceback
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+import librosa
+import numpy as np
+import torch
+from einops import rearrange
+from scipy import ndimage
+from torch.special import gammaln
+def calc_wer(target, pred, ignore_indexes=[0]):
+    target_chars = drop_duplicated(list(filter(lambda x: x not in ignore_indexes, map(str, list(target)))))
+    pred_chars = drop_duplicated(list(filter(lambda x: x not in ignore_indexes, map(str, list(pred)))))
+    target_str = ' '.join(target_chars)
+    pred_str = ' '.join(pred_chars)
+    error = jiwer.wer(target_str, pred_str)
+    return error
+def drop_duplicated(chars):
+    ret_chars = [chars[0]]
+    for prev, curr in zip(chars[:-1], chars[1:]):
+        if prev != curr:
+            ret_chars.append(curr)
+    return ret_chars
+# def build_criterion(critic_params={}):
+#     criterion = {
+#         "ce": nn.CrossEntropyLoss(ignore_index=-1),
+#         "ctc": torch.nn.CTCLoss(**critic_params.get('ctc', {})),
+#         "hinge": nn.HingeEmbeddingLoss(margin=critic_params.get('hinge', {}).get("margin", 1.0))
+#     }
+#     return criterion
+def build_criterion(critic_params={}):
+    criterion = {
+        "ce": nn.CrossEntropyLoss(ignore_index=-1),
+        "ctc": torch.nn.CTCLoss(**critic_params.get('ctc', {})),
+    }
+    return criterion
+def get_data_path_list(train_path=None, val_path=None):
+    if train_path is None:
+        train_path = "Data/train_list.txt"
+    if val_path is None:
+        val_path = "Data/val_list.txt"
+    with open(train_path, 'r') as f:
+        train_list = f.readlines()
+    with open(val_path, 'r') as f:
+        val_list = f.readlines()
+    return train_list, val_list
+def plot_image(image):
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(image, aspect="auto", origin="lower",
+                   interpolation='none')
+    fig.canvas.draw()
+    plt.close()
+    return fig
+class PartialConv1d(torch.nn.Conv1d):
+    """
+    Zero padding creates a unique identifier for where the edge of the data is, such that the model can almost always identify
+    exactly where it is relative to either edge given a sufficient receptive field. Partial padding goes to some lengths to remove
+    this affect.
+    """
+    __constants__ = ['slide_winsize']
+    slide_winsize: float
+    def __init__(self, *args, **kwargs):
+        super(PartialConv1d, self).__init__(*args, **kwargs)
+        weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
+        self.register_buffer("weight_maskUpdater", weight_maskUpdater, persistent=False)
+        self.slide_winsize = self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]
+    def forward(self, input, mask_in):
+        if mask_in is None:
+            mask = torch.ones(1, 1, input.shape[2], dtype=input.dtype, device=input.device)
+        else:
+            mask = mask_in
+            input = torch.mul(input, mask)
+        with torch.no_grad():
+            update_mask = F.conv1d(
+                mask,
+                self.weight_maskUpdater,
+                bias=None,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=1,
+            )
+            update_mask_filled = torch.masked_fill(update_mask, update_mask == 0, self.slide_winsize)
+            mask_ratio = self.slide_winsize / update_mask_filled
+            update_mask = torch.clamp(update_mask, 0, 1)
+            mask_ratio = torch.mul(mask_ratio, update_mask)
+        raw_out = self._conv_forward(input, self.weight, self.bias)
+        if self.bias is not None:
+            bias_view = self.bias.view(1, self.out_channels, 1)
+            output = torch.mul(raw_out - bias_view, mask_ratio) + bias_view
+            output = torch.mul(output, update_mask)
+        else:
+            output = torch.mul(raw_out, mask_ratio)
+        return output
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super().__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+    def forward(self, x):
+        return self.linear_layer(x)
+class ConvNorm(torch.nn.Module):
+    __constants__ = ['use_partial_padding']
+    use_partial_padding: bool
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=None,
+        dilation=1,
+        bias=True,
+        w_init_gain='linear',
+        use_partial_padding=False,
+        use_weight_norm=False,
+        norm_fn=None,
+    ):
+        super(ConvNorm, self).__init__()
+        if padding is None:
+            assert kernel_size % 2 == 1
+            padding = int(dilation * (kernel_size - 1) / 2)
+        self.use_partial_padding = use_partial_padding
+        conv_fn = torch.nn.Conv1d
+        if use_partial_padding:
+            conv_fn = PartialConv1d
+        self.conv = conv_fn(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        torch.nn.init.xavier_uniform_(self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+        if use_weight_norm:
+            self.conv = torch.nn.utils.weight_norm(self.conv)
+        if norm_fn is not None:
+            self.norm = norm_fn(out_channels, affine=True)
+        else:
+            self.norm = None
+    def forward(self, signal, mask=None):
+        if self.use_partial_padding:
+            ret = self.conv(signal, mask)
+            if self.norm is not None:
+                ret = self.norm(ret, mask)
+        else:
+            if mask is not None:
+                signal = signal.mul(mask)
+            ret = self.conv(signal)
+            if self.norm is not None:
+                ret = self.norm(ret)
+        # if self.is_adapter_available():
+        #     ret = self.forward_enabled_adapters(ret.transpose(1, 2)).transpose(1, 2)
+        return ret
+class BetaBinomialInterpolator:
+    """
+    This module calculates alignment prior matrices (based on beta-binomial distribution) using cached popular sizes and image interpolation.
+    The implementation is taken from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+    """
+    def __init__(self, round_mel_len_to=50, round_text_len_to=10, cache_size=500, scaling_factor: float = 1.0):
+        self.round_mel_len_to = round_mel_len_to
+        self.round_text_len_to = round_text_len_to
+        cached_func = lambda x, y: beta_binomial_prior_distribution(x, y, scaling_factor=scaling_factor)
+        self.bank = functools.lru_cache(maxsize=cache_size)(cached_func)
+    @staticmethod
+    def round(val, to):
+        return max(1, int(np.round((val + 1) / to))) * to
+    def __call__(self, w, h):
+        bw = BetaBinomialInterpolator.round(w, to=self.round_mel_len_to)
+        bh = BetaBinomialInterpolator.round(h, to=self.round_text_len_to)
+        ret = ndimage.zoom(self.bank(bw, bh).T, zoom=(w / bw, h / bh), order=1)
+        assert ret.shape[0] == w, ret.shape
+        assert ret.shape[1] == h, ret.shape
+        return ret
+def general_padding(item, item_len, max_len, pad_value=0):
+    if item_len < max_len:
+        item = torch.nn.functional.pad(item, (0, max_len - item_len), value=pad_value)
+    return item
+def stack_tensors(tensors: List[torch.Tensor], max_lens: List[int], pad_value: float = 0.0) -> torch.Tensor:
+    """
+    Create batch by stacking input tensor list along the time axes.
+    Args:
+        tensors: List of tensors to pad and stack
+        max_lens: List of lengths to pad each axis to, starting with the last axis
+        pad_value: Value for padding
+    Returns:
+        Padded and stacked tensor.
+    """
+    padded_tensors = []
+    for tensor in tensors:
+        padding = []
+        for i, max_len in enumerate(max_lens, 1):
+            padding += [0, max_len - tensor.shape[-i]]
+        padded_tensor = torch.nn.functional.pad(tensor, pad=padding, value=pad_value)
+        padded_tensors.append(padded_tensor)
+    stacked_tensor = torch.stack(padded_tensors)
+    return stacked_tensor
+def logbeta(x, y):
+    return gammaln(x) + gammaln(y) - gammaln(x + y)
+def logcombinations(n, k):
+    return gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1)
+def logbetabinom(n, a, b, x):
+    return logcombinations(n, x) + logbeta(x + a, n - x + b) - logbeta(a, b)
+def beta_binomial_prior_distribution(phoneme_count: int, mel_count: int, scaling_factor: float = 1.0) -> np.array:
+    x = rearrange(torch.arange(0, phoneme_count), "b -> 1 b")
+    y = rearrange(torch.arange(1, mel_count + 1), "b -> b 1")
+    a = scaling_factor * y
+    b = scaling_factor * (mel_count + 1 - y)
+    n = torch.FloatTensor([phoneme_count - 1])
+    return logbetabinom(n, a, b, x).exp().numpy()
+# example : attn_prior = (torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(text.device))