Billpai committed
Commit · 0312eff
1 Parent(s): f196feb
test

Browse files

- optimizer/__init__.py +0 -0
- optimizer/optimizers.py +780 -0
- preprocessors/__init__.py +189 -0
- preprocessors/bigdata.py +145 -0
- preprocessors/cdmusiceval.py +174 -0
- preprocessors/coco.py +100 -0
- preprocessors/cocoeval.py +99 -0
- preprocessors/csd.py +202 -0
- preprocessors/custom.py +143 -0
- preprocessors/kising.py +116 -0
- preprocessors/libritts.py +143 -0
- preprocessors/lijian.py +151 -0
- preprocessors/ljspeech.py +197 -0
- preprocessors/ljspeech_vocoder.py +86 -0
- preprocessors/m4singer.py +138 -0
- preprocessors/metadata.py +138 -0
- preprocessors/nus48e.py +203 -0
- preprocessors/opencpop.py +73 -0
- preprocessors/opensinger.py +169 -0
- preprocessors/opera.py +186 -0
- preprocessors/pjs.py +135 -0
- preprocessors/popbutfy.py +153 -0
- preprocessors/popcs.py +118 -0
- preprocessors/processor.py +100 -0
- preprocessors/svcc.py +85 -0
- preprocessors/svcceval.py +80 -0
- preprocessors/vctk.py +163 -0
- preprocessors/vctkfewsinger.py +175 -0
- preprocessors/vctksample.py +108 -0
- preprocessors/vocalist.py +137 -0
- pretrained/bigvgan/args.json +235 -0
- pretrained/contentvec/README.md +5 -0
optimizer/__init__.py
ADDED
File without changes
optimizer/optimizers.py
ADDED
@@ -0,0 +1,780 @@
# This module is modified from https://github.com/Plachtaa/VALL-E-X/blob/3faaf8ccadb154d63b38070caf518ce9309ea0f4/modules/optim.py#L836

import logging
import contextlib
import torch
from torch import Tensor
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Optimizer
from typing import List, Tuple
from collections import defaultdict


class NoamLR(_LRScheduler):
    """
    Implements the Noam learning rate schedule. This corresponds to increasing the learning rate
    linearly for the first ``num_warmup`` training steps, and decreasing it thereafter proportionally
    to the inverse square root of the step number, scaled by the inverse square root of the
    dimensionality of the model. Time will tell if this is just madness or it's actually important.

    Parameters
    ----------
    num_warmup: ``int``, required.
        The number of steps to linearly increase the learning rate.
    """

    def __init__(self, optimizer, num_warmup):
        self.num_warmup = num_warmup
        self.base_lr = optimizer.param_groups[0]["lr"]
        super().__init__(optimizer)

    def get_lr(self):
        last_epoch = max(1, self.last_epoch)
        scale = min(last_epoch ** (-0.5), last_epoch * self.num_warmup ** (-1.5))
        return [scale * self.base_lr]


class Eve(Optimizer):
    """
    Implements the Eve algorithm. This is a modified version of AdamW with a special
    way of setting the weight-decay / shrinkage-factor, which is designed to make the
    rms of the parameters approach a particular target_rms (default: 0.1). This is
    for use with networks with 'scaled' versions of modules (see scaling.py), which
    will be close to invariant to the absolute scale on the parameter matrix.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
    Eve is unpublished so far.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 3e-4;
            this value means that the weight would decay significantly after
            about 3k minibatches. It is not multiplied by the learning rate, but
            is conditional on the RMS-value of the parameter being > target_rms.
        target_rms (float, optional): target root-mean-square value of
            parameters; if they fall below this we will stop applying weight decay.


    .. _Adam\\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.98),
        eps=1e-8,
        weight_decay=1e-3,
        target_rms=0.1,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0 <= weight_decay <= 0.1:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0 < target_rms <= 10.0:
            raise ValueError("Invalid target_rms value: {}".format(target_rms))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            target_rms=target_rms,
        )
        super(Eve, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Eve, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                # Perform optimization step
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError("AdamW does not support sparse gradients")

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                beta1, beta2 = group["betas"]

                state["step"] += 1
                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
                    group["eps"]
                )

                step_size = group["lr"] / bias_correction1
                target_rms = group["target_rms"]
                weight_decay = group["weight_decay"]

                if p.numel() > 1:
                    # avoid applying this weight-decay on "scaling factors"
                    # (which are scalar).
                    is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
                    p.mul_(1 - (weight_decay * is_above_target_rms))

                p.addcdiv_(exp_avg, denom, value=-step_size)

                # if random.random() < 0.0005:
                #     step = (exp_avg / denom) * step_size
                #     logging.info(
                #         f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
                #     )

        return loss


class BatchedOptimizer(Optimizer):
    """
    This class adds to class Optimizer the capability to optimize parameters in batches:
    it will stack the parameters and their grads for you so the optimizer can work
    on tensors with an extra leading dimension. This is intended for speed with GPUs,
    as it reduces the number of kernels launched in the optimizer.

    Args:
      params:
    """

    def __init__(self, params, defaults):
        super(BatchedOptimizer, self).__init__(params, defaults)

    @contextlib.contextmanager
    def batched_params(self, param_group, group_params_names):
        """
        This function returns (technically, yields) a list of
        tuples (p, state), where
        p is a `fake` parameter that is stacked (over axis 0) from real parameters
        that share the same shape, and its gradient is also stacked;
        `state` is the state corresponding to this batch of parameters
        (it will be physically located in the "state" for one of the real
        parameters, the last one that has any particular shape and dtype).

        This function is decorated as a context manager so that it can
        write parameters back to their "real" locations.

        The idea is, instead of doing:
        <code>
          for p in group["params"]:
             state = self.state[p]
             ...
        </code>
        you can do:
        <code>
          with self.batched_params(group["params"]) as batches:
             for p, state, p_names in batches:
                 ...
        </code>

        Args:
          group: a parameter group, which is a list of parameters; should be
                one of self.param_groups.
          group_params_names: name for each parameter in group,
                which is List[str].
        """
        batches = defaultdict(
            list
        )  # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
        batches_names = defaultdict(
            list
        )  # `batches_names` maps from tuple (dtype_as_str,*shape) to list of str

        assert len(param_group) == len(group_params_names)
        for p, named_p in zip(param_group, group_params_names):
            key = (str(p.dtype), *p.shape)
            batches[key].append(p)
            batches_names[key].append(named_p)

        batches_names_keys = list(batches_names.keys())
        sorted_idx = sorted(
            range(len(batches_names)), key=lambda i: batches_names_keys[i]
        )
        batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
        batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]

        stacked_params_dict = dict()

        # turn batches into a list, in deterministic order.
        # tuples will contain tuples of (stacked_param, state, stacked_params_names),
        # one for each batch in `batches`.
        tuples = []

        for batch, batch_names in zip(batches, batches_names):
            p = batch[0]
            # we arbitrarily store the state in the
            # state corresponding to the 1st parameter in the
            # group. class Optimizer will take care of saving/loading state.
            state = self.state[p]
            p_stacked = torch.stack(batch)
            grad = torch.stack(
                [torch.zeros_like(p) if p.grad is None else p.grad for p in batch]
            )
            p_stacked.grad = grad
            stacked_params_dict[key] = p_stacked
            tuples.append((p_stacked, state, batch_names))

        yield tuples

        for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
            for i, p in enumerate(batch):
                p.copy_(stacked_params[i])


class ScaledAdam(BatchedOptimizer):
    """
    Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
    proportional to the norm of that parameter; and also learn the scale of the parameter,
    in log space, subject to upper and lower limits (as if we had factored each parameter as
    param = underlying_param * log_scale.exp())


    Args:
         params:  The parameters or param_groups to optimize (like other Optimizer subclasses)
             lr:  The learning rate. We will typically use a learning rate schedule that starts
                  at 0.03 and decreases over time, i.e. much higher than other common
                  optimizers.
     clipping_scale: (e.g. 2.0)
                  A scale for gradient-clipping: if specified, the normalized gradients
                  over the whole model will be clipped to have 2-norm equal to
                  `clipping_scale` times the median 2-norm over the most recent period
                  of `clipping_update_period` minibatches. By "normalized gradients",
                  we mean after multiplying by the rms parameter value for this tensor
                  [for non-scalars]; this is appropriate because our update is scaled
                  by this quantity.
          betas:  beta1, beta2 are momentum constants for regular momentum, and moving sum-sq grad.
                  Must satisfy 0 < beta1 <= beta2 < 1.
     scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
                  scale of each parameter tensor and scalar parameters of the model.
                  If each parameter were decomposed
                  as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
                  would be the scaling factor on the learning rate of p_scale.
            eps:  A general-purpose epsilon to prevent division by zero
   param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
                  learning the scale on the parameters (we'll constrain the rms of each non-scalar
                  parameter tensor to be >= this value)
   param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
                  learning the scale on the parameters (we'll constrain the rms of each non-scalar
                  parameter tensor to be <= this value)
      scalar_max: Maximum absolute value for scalar parameters (applicable if your
                  model has any parameters with numel() == 1).
   size_update_period: The periodicity, in steps, with which we update the size (scale)
                  of the parameter tensor. This is provided to save a little time
                  in the update.
   clipping_update_period: if clipping_scale is specified, this is the period
    """

    def __init__(
        self,
        params,
        lr=3e-02,
        clipping_scale=None,
        betas=(0.9, 0.98),
        scalar_lr_scale=0.1,
        eps=1.0e-08,
        param_min_rms=1.0e-05,
        param_max_rms=3.0,
        scalar_max=10.0,
        size_update_period=4,
        clipping_update_period=100,
        parameters_names=None,
        show_dominant_parameters=True,
    ):

        assert parameters_names is not None, (
            "Please prepare parameters_names,"
            "which is a List[List[str]]. Each List[str] is for a group"
            "and each str is for a parameter"
        )
        defaults = dict(
            lr=lr,
            clipping_scale=clipping_scale,
            betas=betas,
            scalar_lr_scale=scalar_lr_scale,
            eps=eps,
            param_min_rms=param_min_rms,
            param_max_rms=param_max_rms,
            scalar_max=scalar_max,
            size_update_period=size_update_period,
            clipping_update_period=clipping_update_period,
        )

        super(ScaledAdam, self).__init__(params, defaults)
        assert len(self.param_groups) == len(parameters_names)
        self.parameters_names = parameters_names
        self.show_dominant_parameters = show_dominant_parameters

    def __setstate__(self, state):
        super(ScaledAdam, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        batch = True

        for group, group_params_names in zip(self.param_groups, self.parameters_names):

            with self.batched_params(group["params"], group_params_names) as batches:

                # batches is list of pairs (stacked_param, state). stacked_param is like
                # a regular parameter, and will have a .grad, but the 1st dim corresponds to
                # a stacking dim, it is not a real dim.

                if (
                    len(batches[0][1]) == 0
                ):
                    clipping_scale = 1
                else:
                    clipping_scale = self._get_clipping_scale(group, batches)

                for p, state, _ in batches:
                    # Perform optimization step.
                    # grad is not going to be None, we handled that when creating the batches.
                    grad = p.grad
                    if grad.is_sparse:
                        raise RuntimeError(
                            "ScaledAdam optimizer does not support sparse gradients"
                        )
                    # State initialization
                    if len(state) == 0:
                        self._init_state(group, p, state)

                    self._step_one_batch(group, p, state, clipping_scale)

        return loss

    def _init_state(self, group: dict, p: Tensor, state: dict):
        """
        Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
        is actually the batch dimension, corresponding to batched-together
        parameters of a given shape.


        Args:
           group:   Dict to look up configuration values.
               p: The parameter that we are initializing the state for
           state: Dict from string to whatever state we are initializing
        """
        size_update_period = group["size_update_period"]

        state["step"] = 0

        kwargs = {"device": p.device, "dtype": p.dtype}

        # 'delta' implements conventional momentum. There are
        # several different kinds of update going on, so rather than
        # compute "exp_avg" like in Adam, we store and decay a
        # parameter-change "delta", which combines all forms of
        # update. this is equivalent to how it's done in Adam,
        # except for the first few steps.
        state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)

        batch_size = p.shape[0]
        numel = p.numel() // batch_size
        numel = p.numel()

        if numel > 1:
            # "param_rms" just periodically records the scalar root-mean-square value of
            # the parameter tensor.
            # it has a shape like (batch_size, 1, 1, 1, 1)
            param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
            state["param_rms"] = param_rms

            state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
            state["scale_grads"] = torch.zeros(
                size_update_period, *param_rms.shape, **kwargs
            )

        # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
        state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)

    def _get_clipping_scale(
        self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
    ) -> float:
        """
        Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
        by this amount before applying the rest of the update.

        Args:
           group: the parameter group, an item in self.param_groups
           tuples: a list of tuples of (param, state, param_names)
                where param is a batched set of parameters,
                with a .grad (1st dim is batch dim)
                and state is the state-dict where optimization parameters are kept.
                param_names is a List[str] where each str is the name for a parameter
                in the batched set of parameters "param".
        """
        assert len(tuples) >= 1
        clipping_scale = group["clipping_scale"]
        (first_p, first_state, _) = tuples[0]
        step = first_state["step"]
        if clipping_scale is None or step == 0:
            # no clipping. return early on step == 0 because the other
            # parameters' state won't have been initialized yet.
            return 1.0
        clipping_update_period = group["clipping_update_period"]

        tot_sumsq = torch.tensor(0.0, device=first_p.device)
        for (p, state, param_names) in tuples:
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError(
                    "ScaledAdam optimizer does not support sparse gradients"
                )
            if p.numel() == p.shape[0]:  # a batch of scalars
                tot_sumsq += (grad**2).sum()  # sum() to change shape [1] to []
            else:
                tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()

        tot_norm = tot_sumsq.sqrt()
        if "model_norms" not in first_state:
            first_state["model_norms"] = torch.zeros(
                clipping_update_period, device=p.device
            )
        first_state["model_norms"][step % clipping_update_period] = tot_norm

        if step % clipping_update_period == 0:
            # Print some stats.
            # We don't reach here if step == 0 because we would have returned
            # above.
            sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
            quartiles = []
            for n in range(0, 5):
                index = min(
                    clipping_update_period - 1,
                    (clipping_update_period // 4) * n,
                )
                quartiles.append(sorted_norms[index].item())

            median = quartiles[2]
            threshold = clipping_scale * median
            first_state["model_norm_threshold"] = threshold
            percent_clipped = (
                first_state["num_clipped"] * 100.0 / clipping_update_period
                if "num_clipped" in first_state
                else 0.0
            )
            first_state["num_clipped"] = 0
            quartiles = " ".join(["%.3e" % x for x in quartiles])
            logging.info(
                f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
                f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
            )

        if step < clipping_update_period:
            return 1.0  # We have not yet estimated a norm to clip to.
        else:
            try:
                model_norm_threshold = first_state["model_norm_threshold"]
            except KeyError:
                logging.info(
                    "Warning: model_norm_threshold not in state: possibly "
                    "you changed config when restarting, adding clipping_scale option?"
                )
                return 1.0
            ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
            if ans < 1.0:
                first_state["num_clipped"] += 1
            if ans < 0.1:
                logging.warn(
                    f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
                )
                if self.show_dominant_parameters:
                    assert p.shape[0] == len(param_names)
                    self._show_gradient_dominating_parameter(tuples, tot_sumsq)
            return ans

    def _show_gradient_dominating_parameter(
        self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor
    ):
        """
        Show information about the parameter which dominates tot_sumsq.

        Args:
            tuples: a list of tuples of (param, state, param_names)
                where param is a batched set of parameters,
                with a .grad (1st dim is batch dim)
                and state is the state-dict where optimization parameters are kept.
                param_names is a List[str] where each str is the name for a parameter
                in the batched set of parameters "param".
            tot_sumsq: sumsq of all parameters. Though it could be calculated
                from tuples, we still pass it to save some time.
        """
        all_sumsq_orig = {}
        for (p, state, batch_param_names) in tuples:
            # p is a stacked batch of parameters.
            batch_grad = p.grad
            if p.numel() == p.shape[0]:  # a batch of scalars
                batch_sumsq_orig = batch_grad**2
                # Dummy values used by following `zip` statement.
                batch_rms_orig = torch.ones(p.shape[0])
            else:
                batch_rms_orig = state["param_rms"]
                batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(
                    dim=list(range(1, batch_grad.ndim))
                )

            for name, sumsq_orig, rms, grad in zip(
                batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
            ):

                proportion_orig = sumsq_orig / tot_sumsq
                all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)

        assert torch.isclose(
            sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
            torch.tensor(1.0),
        )
        sorted_by_proportion = {
            k: v
            for k, v in sorted(
                all_sumsq_orig.items(),
                key=lambda item: item[1][0],
                reverse=True,
            )
        }
        dominant_param_name = next(iter(sorted_by_proportion))
        (
            dominant_proportion,
            dominant_sumsq,
            dominant_rms,
            dominant_grad,
        ) = sorted_by_proportion[dominant_param_name]
        logging.info(
            f"Parameter Dominanting tot_sumsq {dominant_param_name}"
            f" with proportion {dominant_proportion:.2f},"
            f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
            f"={dominant_sumsq:.3e},"
            f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
            f" orig_rms_sq={(dominant_rms**2).item():.3e}"
        )

    def _step_one_batch(
        self, group: dict, p: Tensor, state: dict, clipping_scale: float
    ):
        """
        Do the step for one parameter, which is actually going to be a batch of
        `real` parameters, with dim 0 as the batch dim.
        Args:
                  group:  dict to look up configuration values
                      p:  parameter to update (actually multiple parameters stacked together
                          as a batch)
                  state:  state-dict for p, to look up the optimizer state
        """
        lr = group["lr"]
        size_update_period = group["size_update_period"]
        beta1 = group["betas"][0]

        grad = p.grad
        if clipping_scale != 1.0:
            grad = grad * clipping_scale
        step = state["step"]
        delta = state["delta"]

        delta.mul_(beta1)
        batch_size = p.shape[0]
        numel = p.numel() // batch_size
        if numel > 1:
            # Update the size/scale of p, and set param_rms
            scale_grads = state["scale_grads"]
            scale_grads[step % size_update_period] = (p * grad).sum(
                dim=list(range(1, p.ndim)), keepdim=True
            )
            if step % size_update_period == size_update_period - 1:
                param_rms = state["param_rms"]  # shape: (batch_size, 1, 1, ..)
                param_rms.copy_(
                    (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
                )
                if step > 0:
                    # self._size_update() learns the overall scale on the
                    # parameter, by shrinking or expanding it.
                    self._size_update(group, scale_grads, p, state)

        if numel == 1:
            # For parameters with 1 element we just use regular Adam.
            # Updates delta.
            self._step_scalar(group, p, state)
        else:
            self._step(group, p, state)

        state["step"] = step + 1

    def _size_update(
        self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
    ) -> None:
        """
        Called only where p.numel() > 1, this updates the scale of the parameter.
        If we imagine: p = underlying_param * scale.exp(), and we are doing
        gradient descent on underlying param and on scale, this function does the update
        on `scale`.

        Args:
             group: dict to look up configuration values
       scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
                    grads w.r.t. the scales.
                 p: The parameter to update
             state: The state-dict of p
        """

        param_rms = state["param_rms"]
        beta1, beta2 = group["betas"]
        size_lr = group["lr"] * group["scalar_lr_scale"]
        param_min_rms = group["param_min_rms"]
        param_max_rms = group["param_max_rms"]
        eps = group["eps"]
        step = state["step"]
        batch_size = p.shape[0]

        size_update_period = scale_grads.shape[0]
        # correct beta2 for the size update period: we will have
        # faster decay at this level.
        beta2_corr = beta2**size_update_period

        scale_exp_avg_sq = state["scale_exp_avg_sq"]  # shape: (batch_size, 1, 1, ..)
        scale_exp_avg_sq.mul_(beta2_corr).add_(
            (scale_grads**2).mean(dim=0),  # mean over dim `size_update_period`
            alpha=1 - beta2_corr,
        )  # shape is (batch_size, 1, 1, ...)

        # The 1st time we reach here is when size_step == 1.
        size_step = (step + 1) // size_update_period
        bias_correction2 = 1 - beta2_corr**size_step
        # we don't bother with bias_correction1; this will help prevent divergence
        # at the start of training.

        denom = scale_exp_avg_sq.sqrt() + eps

        scale_step = (
            -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
        )

        is_too_small = param_rms < param_min_rms
        is_too_large = param_rms > param_max_rms

        # when the param gets too small, just don't shrink it any further.
        scale_step.masked_fill_(is_too_small, 0.0)
        # when it gets too large, stop it from getting any larger.
        scale_step.masked_fill_(is_too_large, -size_lr * size_update_period)
        delta = state["delta"]
        # the factor of (1-beta1) relates to momentum.
        delta.add_(p * scale_step, alpha=(1 - beta1))

    def _step(self, group: dict, p: Tensor, state: dict):
        """
        This function does the core update of self.step(), in the case where the members of
        the batch have more than 1 element.

        Args:
            group: A dict which will be used to look up configuration values
                p: The parameter to be updated
             grad: The grad of p
            state: The state-dict corresponding to parameter p

        This function modifies p.
        """
        grad = p.grad
        lr = group["lr"]
        beta1, beta2 = group["betas"]
        eps = group["eps"]
        param_min_rms = group["param_min_rms"]
        step = state["step"]

        exp_avg_sq = state["exp_avg_sq"]
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))

        this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
        bias_correction2 = 1 - beta2 ** (this_step + 1)
        if bias_correction2 < 0.99:
            # note: not in-place.
            exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)

        denom = exp_avg_sq.sqrt()
        denom += eps
        grad = grad / denom

        alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)

        delta = state["delta"]
        delta.add_(grad * alpha)
        p.add_(delta)

    def _step_scalar(self, group: dict, p: Tensor, state: dict):
        """
        A simplified form of the core update for scalar tensors, where we cannot get a good
        estimate of the parameter rms.
        """
        beta1, beta2 = group["betas"]
        scalar_max = group["scalar_max"]
        eps = group["eps"]
        lr = group["lr"] * group["scalar_lr_scale"]
        grad = p.grad

        exp_avg_sq = state["exp_avg_sq"]  # shape: (batch_size,)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        # bias_correction2 is like in Adam. Don't bother with bias_correction1;
        # slower update at the start will help stability anyway.
        bias_correction2 = 1 - beta2 ** (state["step"] + 1)
        denom = (exp_avg_sq / bias_correction2).sqrt() + eps

        delta = state["delta"]
        delta.add_(grad / denom, alpha=-lr * (1 - beta1))
        p.clamp_(min=-scalar_max, max=scalar_max)
        p.add_(delta)
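A minimal usage sketch for the optimizers added above (not part of this commit): ScaledAdam requires `parameters_names`, one List[str] per parameter group, and Eve can be paired with NoamLR for warmup. The toy linear model, the 0.03 learning rate, the clipping_scale of 2.0, and the warmup length below are illustrative assumptions, not values taken from this repository.

# Hypothetical usage sketch; the model and hyper-parameter values are assumptions.
import torch
from optimizer.optimizers import Eve, NoamLR, ScaledAdam

model = torch.nn.Linear(80, 256)

# ScaledAdam needs parameters_names: one List[str] per param group,
# aligned with the parameters it receives.
parameters_names = [[name for name, _ in model.named_parameters()]]
scaled_adam = ScaledAdam(
    model.parameters(),
    lr=3e-2,             # the docstring suggests a schedule starting near 0.03
    clipping_scale=2.0,  # optional: clip to 2x the recent median normalized grad norm
    parameters_names=parameters_names,
)

# Eve is the AdamW-like variant; NoamLR warms the learning rate up for num_warmup steps.
eve = Eve(model.parameters(), lr=1e-3)
scheduler = NoamLR(eve, num_warmup=4000)

x, y = torch.randn(4, 80), torch.randn(4, 256)
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
scaled_adam.step()       # one ScaledAdam update
scaled_adam.zero_grad()
# With Eve one would instead call eve.step(); eve.zero_grad(); scheduler.step() per iteration.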
preprocessors/__init__.py
ADDED
@@ -0,0 +1,189 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
For source datasets' standard samples
"""

from collections import defaultdict
import os
import json

SPEECH_DATASETS = ["vctk", "vctksample"]

GOLDEN_TEST_SAMPLES = defaultdict(list)
GOLDEN_TEST_SAMPLES["m4singer"] = [
    "Alto-1_美错_0014",
    "Bass-1_十年_0008",
    "Soprano-2_同桌的你_0018",
    "Tenor-5_爱笑的眼睛_0010",
]
GOLDEN_TEST_SAMPLES["svcc"] = [
    # IDF1
    "IDF1_10030",
    "IDF1_10120",
    "IDF1_10140",
    # IDM1
    "IDM1_10001",
    "IDM1_10030",
    "IDM1_10120",
    # CDF1
    "CDF1_10030",
    "CDF1_10120",
    "CDF1_10140",
    # CDM1
    "CDM1_10001",
    "CDM1_10030",
    "CDM1_10120",
]
GOLDEN_TEST_SAMPLES["svcceval"] = [
    # SF1
    "SF1_30001",
    "SF1_30002",
    "SF1_30003",
    # SM1
    "SM1_30001",
    "SM1_30002",
    "SM1_30003",
]
GOLDEN_TEST_SAMPLES["popbutfy"] = [
    "Female1#you_are_my_sunshine_Professional#0",
    "Female4#Someone_Like_You_Professional#10",
    "Male2#Lemon_Tree_Professional#12",
    "Male5#can_you_feel_the_love_tonight_Professional#20",
]
GOLDEN_TEST_SAMPLES["opensinger"] = [
    "Man_0_大鱼_10",
    "Man_21_丑八怪_14",
    "Woman_39_mojito_22",
    "Woman_40_易燃易爆炸_12",
]
GOLDEN_TEST_SAMPLES["nus48e"] = [
    "ADIZ_read#01#0000",
    "MCUR_sing#10#0000",
    "JLEE_read#08#0001",
    "SAMF_sing#18#0001",
]
GOLDEN_TEST_SAMPLES["popcs"] = [
    "明天会更好_0004",
    "欧若拉_0005",
    "虫儿飞_0006",
    "隐形的翅膀_0008",
]
GOLDEN_TEST_SAMPLES["kising"] = [
    "421_0040",
    "424_0013",
    "431_0026",
]
GOLDEN_TEST_SAMPLES["csd"] = [
    "en_004a_0001",
    "en_042b_0006",
    "kr_013a_0006",
    "kr_045b_0004",
]
GOLDEN_TEST_SAMPLES["opera"] = [
    "fem_01#neg_1#0000",
    "fem_12#pos_3#0003",
    "male_02#neg_1#0002",
    "male_11#pos_2#0001",
]
GOLDEN_TEST_SAMPLES["lijian"] = [
    "058矜持_0000",
    "079绒花_0000",
    "120遥远的天空底下_0000",
]
GOLDEN_TEST_SAMPLES["cdmusiceval"] = ["陶喆_普通朋友", "蔡琴_给电影人的情书"]

GOLDEN_TRAIN_SAMPLES = defaultdict(list)


def get_golden_samples_indexes(
    dataset_name,
    dataset_dir=None,
    cfg=None,
    split=None,
    min_samples=5,
):
    """
    # Get Standard samples' indexes
    """
    if dataset_dir is None:
        assert cfg is not None
        dataset_dir = os.path.join(
            cfg.OUTPUT_PATH,
            "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
            dataset_name,
        )

    assert split is not None
    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
    with open(utt_file, "r", encoding="utf-8") as f:
        samples = json.load(f)

    if "train" in split:
        golden_samples = GOLDEN_TRAIN_SAMPLES[dataset_name]
    if "test" in split:
        golden_samples = GOLDEN_TEST_SAMPLES[dataset_name]

    res = []
    for idx, utt in enumerate(samples):
        if utt["Uid"] in golden_samples:
            res.append(idx)

        if dataset_name == "cdmusiceval":
            if "_".join(utt["Uid"].split("_")[:2]) in golden_samples:
                res.append(idx)

    if len(res) == 0:
        res = [i for i in range(min_samples)]

    return res


def get_specific_singer_indexes(dataset_dir, singer_name, split):
    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
    with open(utt_file, "r", encoding="utf-8") as f:
        samples = json.load(f)

    res = []
    for idx, utt in enumerate(samples):
        if utt["Singer"] == singer_name:
            res.append(idx)

    assert len(res) != 0
    return res


def get_uids_and_wav_paths(
    cfg, dataset, dataset_type="train", only_specific_singer=None, return_singers=False
):
    dataset_dir = os.path.join(
        cfg.OUTPUT_PATH, "preprocess/{}_version".format(cfg.PREPROCESS_VERSION), dataset
    )
    dataset_file = os.path.join(
        dataset_dir, "{}.json".format(dataset_type.split("_")[-1])
    )
    with open(dataset_file, "r") as f:
        utterances = json.load(f)

    indexes = range(len(utterances))
    if "golden" in dataset_type:
        # golden_train or golden_test
        indexes = get_golden_samples_indexes(
            dataset, dataset_dir, split=dataset_type.split("_")[-1]
        )
    if only_specific_singer is not None:
        indexes = get_specific_singer_indexes(
            dataset_dir, only_specific_singer, dataset_type
        )

    uids = [utterances[i]["Uid"] for i in indexes]
    wav_paths = [utterances[i]["Path"] for i in indexes]
    singers = [utterances[i]["Singer"] for i in indexes]

    if not return_singers:
        return uids, wav_paths
    else:
        return uids, wav_paths, singers
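The helpers above index into per-dataset train.json / test.json metadata files. A sketch of one utterance entry as these functions expect it is shown below; the field names follow the lookups in this file and in the preprocessors later in this commit, while the wav path value is a hypothetical placeholder.

# Hypothetical example of one entry in <dataset_dir>/train.json or test.json.
# "Uid", "Path" and "Singer" are the keys read by get_golden_samples_indexes,
# get_specific_singer_indexes and get_uids_and_wav_paths above; "Dataset" and
# "Duration" are read by preprocessors/bigdata.py below. The path is made up.
example_utterance = {
    "Dataset": "m4singer",
    "Singer": "Alto-1",
    "Uid": "Alto-1_美错_0014",  # one of the golden test samples listed above
    "Path": "/path/to/m4singer/Alto-1_美错_0014.wav",
    "Duration": 3.5,  # seconds
}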
preprocessors/bigdata.py
ADDED
@@ -0,0 +1,145 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import os
from collections import defaultdict
from tqdm import tqdm


def get_uids_and_wav_paths(cfg, dataset, dataset_type):
    assert dataset == "bigdata"
    dataset_dir = os.path.join(
        cfg.OUTPUT_PATH,
        "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
        "bigdata/{}".format(cfg.BIGDATA_VERSION),
    )
    dataset_file = os.path.join(
        dataset_dir, "{}.json".format(dataset_type.split("_")[-1])
    )
    with open(dataset_file, "r") as f:
        utterances = json.load(f)

    # Uids
    uids = [u["Uid"] for u in utterances]

    # Wav paths
    wav_paths = [u["Path"] for u in utterances]

    return uids, wav_paths


def take_duration(utt):
    return utt["Duration"]


def main(output_path, cfg):
    datasets = cfg.dataset

    print("-" * 10)
    print("Preparing samples for bigdata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    datasets.sort()
    bigdata_version = "_".join(datasets)

    save_dir = os.path.join(output_path, bigdata_version)
    os.makedirs(save_dir, exist_ok=True)

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
    utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)
    utt2singer = open(utt2singer_file, "a+")
    # We select songs of standard samples as test songs
    train = []
    test = []

    train_total_duration = 0
    test_total_duration = 0

    # Singer unique names
    singer_names = set()

    for dataset in datasets:
        dataset_path = os.path.join(output_path, dataset)
        train_json = os.path.join(dataset_path, "train.json")
        test_json = os.path.join(dataset_path, "test.json")

        with open(train_json, "r", encoding="utf-8") as f:
            train_utterances = json.load(f)

        with open(test_json, "r", encoding="utf-8") as f:
            test_utterances = json.load(f)

        for utt in tqdm(train_utterances):
            train.append(utt)
            train_total_duration += utt["Duration"]
            singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"]))
            utt2singer.write(
                "{}_{}\t{}_{}\n".format(
                    utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"]
                )
            )

        for utt in test_utterances:
            test.append(utt)
            test_total_duration += utt["Duration"]
            singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"]))
            utt2singer.write(
                "{}_{}\t{}_{}\n".format(
                    utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"]
                )
            )

    utt2singer.close()

    train.sort(key=take_duration)
    test.sort(key=take_duration)
    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Singer Look Up Table
    singer_names = list(singer_names)
    singer_names.sort()
    singer_lut = {name: i for i, name in enumerate(singer_names)}
    print("#Singers: {}\n".format(len(singer_lut)))

    # Save
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    # Save meta info
    meta_info = {
        "datasets": datasets,
        "train": {"size": len(train), "hours": round(train_total_duration / 3600, 4)},
        "test": {"size": len(test), "hours": round(test_total_duration / 3600, 4)},
        "singers": {"size": len(singer_lut)},
    }
    singer2mins = defaultdict(float)
    for utt in train:
        dataset, singer, duration = utt["Dataset"], utt["Singer"], utt["Duration"]
        singer2mins["{}_{}".format(dataset, singer)] += duration / 60
    singer2mins = sorted(singer2mins.items(), key=lambda x: x[1], reverse=True)
    singer2mins = dict(
        zip([i[0] for i in singer2mins], [round(i[1], 2) for i in singer2mins])
    )
    meta_info["singers"]["training_minutes"] = singer2mins

    with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
        json.dump(meta_info, f, indent=4, ensure_ascii=False)

    for singer, min in singer2mins.items():
        print("Singer {}: {} mins".format(singer, min))
    print("-" * 10, "\n")
preprocessors/cdmusiceval.py
ADDED
@@ -0,0 +1,174 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from glob import glob
import os
import json
import torchaudio
from tqdm import tqdm
from collections import defaultdict

from utils.util import has_existed, remove_and_create
from utils.audio_slicer import split_utterances_from_audio


def split_to_utterances(input_dir, output_dir):
    print("Splitting to utterances for {}...".format(input_dir))

    files_list = glob("*", root_dir=input_dir)
    files_list.sort()
    for wav_file in tqdm(files_list):
        # # Load waveform
        # waveform, fs = torchaudio.load(os.path.join(input_dir, wav_file))

        # Singer name, Song name
        song_name, singer_name = wav_file.split("_")[2].split("-")
        save_dir = os.path.join(output_dir, singer_name, song_name)

        split_utterances_from_audio(
            os.path.join(input_dir, wav_file), save_dir, max_duration_of_utterance=10
        )

        # # Split
        # slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000, min_interval=1000)
        # chunks = slicer.slice(waveform)

        # for i, chunk in enumerate(chunks):
        #     save_dir = os.path.join(output_dir, singer_name, song_name)
        #     os.makedirs(save_dir, exist_ok=True)

        #     output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
        #     save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")
    remove_and_create(utterance_dir)
    split_to_utterances(os.path.join(dataset_path, "vocal"), utterance_dir)


def statistics(utterance_dir):
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(utterance_dir + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        song_infos = glob(singer_info + "/*")

        for song_info in song_infos:
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            utts = glob(song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "Statistics: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing samples for CD Music Eval...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Spliting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "cdmusiceval")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    utt_path = os.path.join(dataset_path, "utterances")
    singers2songs, unique_singers = statistics(utt_path)

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singers2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "cdmusiceval",
                    "Singer": singer,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
                res["Path"] = os.path.join(utt_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if duration <= 1e-8:
                    continue

                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/coco.py
ADDED
@@ -0,0 +1,100 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    return ["007Di Da Di"]


def coco_statistics(data_dir):
    song2utts = defaultdict(list)

    song_infos = glob(data_dir + "/*")

    for song in song_infos:
        song_name = song.split("/")[-1]
        utts = glob(song + "/*.wav")
        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            song2utts[song_name].append(uid)

    print("Coco: {} songs".format(len(song_infos)))
    return song2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing datasets for Coco...\n")

    save_dir = os.path.join(output_path, "coco")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    song2utts = coco_statistics(dataset_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for song_name, uids in tqdm(song2utts.items()):
        for chosen_uid in uids:
            res = {
                "Dataset": "coco",
                "Singer": "coco",
                "Song": song_name,
                "Uid": "{}_{}".format(song_name, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(song_name, chosen_uid)
            res["Path"] = os.path.join(dataset_path, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if song_name in test_songs:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/cocoeval.py
ADDED
@@ -0,0 +1,99 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from utils.audio_slicer import split_utterances_from_audio
from preprocessors import GOLDEN_TEST_SAMPLES


def _split_utts():
    raw_dir = "/mnt/chongqinggeminiceph1fs/geminicephfs/wx-mm-spr-xxxx/xueyaozhang/dataset/李玟/cocoeval/raw"
    output_root = "/mnt/chongqinggeminiceph1fs/geminicephfs/wx-mm-spr-xxxx/xueyaozhang/dataset/李玟/cocoeval/utterances"

    if os.path.exists(output_root):
        os.system("rm -rf {}".format(output_root))

    vocal_files = glob(os.path.join(raw_dir, "*/vocal.wav"))
    for vocal_f in tqdm(vocal_files):
        song_name = vocal_f.split("/")[-2]

        output_dir = os.path.join(output_root, song_name)
        os.makedirs(output_dir, exist_ok=True)

        split_utterances_from_audio(vocal_f, output_dir, min_interval=300)


def cocoeval_statistics(data_dir):
    song2utts = defaultdict(list)

    song_infos = glob(data_dir + "/*")

    for song in song_infos:
        song_name = song.split("/")[-1]
        utts = glob(song + "/*.wav")
        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            song2utts[song_name].append(uid)

    print("Cocoeval: {} songs".format(len(song_infos)))
    return song2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing datasets for Cocoeval...\n")

    save_dir = os.path.join(output_path, "cocoeval")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    song2utts = cocoeval_statistics(dataset_path)

    train, test = [], []
    train_index_count, test_index_count = 0, 0
    train_total_duration, test_total_duration = 0.0, 0.0

    for song_name, uids in tqdm(song2utts.items()):
        for chosen_uid in uids:
            res = {
                "Dataset": "cocoeval",
                "Singer": "TBD",
                "Song": song_name,
                "Uid": "{}_{}".format(song_name, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(song_name, chosen_uid)
            res["Path"] = os.path.join(dataset_path, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            res["index"] = test_index_count
            test_total_duration += duration
            test.append(res)
            test_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/csd.py
ADDED
@@ -0,0 +1,202 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import glob
from tqdm import tqdm
import torchaudio
import pandas as pd
from glob import glob
from collections import defaultdict

from utils.io import save_audio
from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def save_utterance(output_file, waveform, fs, start, end, overlap=0.1):
    """
    waveform: [#channel, audio_len]
    start, end, overlap: seconds
    """
    start = int((start - overlap) * fs)
    end = int((end + overlap) * fs)
    utterance = waveform[:, start:end]
    save_audio(output_file, utterance, fs)


def split_to_utterances(language_dir, output_dir):
    print("Splitting to utterances for {}...".format(language_dir))
    wav_dir = os.path.join(language_dir, "wav")
    phoneme_dir = os.path.join(language_dir, "txt")
    annot_dir = os.path.join(language_dir, "csv")

    pitches = set()
    for wav_file in tqdm(glob("{}/*.wav".format(wav_dir))):
        # Load waveform
        song_name = wav_file.split("/")[-1].split(".")[0]
        waveform, fs = torchaudio.load(wav_file)

        # Load utterances
        phoneme_file = os.path.join(phoneme_dir, "{}.txt".format(song_name))
        with open(phoneme_file, "r") as f:
            lines = f.readlines()
            utterances = [l.strip().split() for l in lines]
            utterances = [utt for utt in utterances if len(utt) > 0]

        # Load annotation
        annot_file = os.path.join(annot_dir, "{}.csv".format(song_name))
        annot_df = pd.read_csv(annot_file)
        pitches = pitches.union(set(annot_df["pitch"]))
        starts = annot_df["start"].tolist()
        ends = annot_df["end"].tolist()
        syllables = annot_df["syllable"].tolist()

        # Split
        curr = 0
        for i, phones in enumerate(utterances):
            sz = len(phones)
            assert phones[0] == syllables[curr]
            assert phones[-1] == syllables[curr + sz - 1]

            s = starts[curr]
            e = ends[curr + sz - 1]
            curr += sz

            save_dir = os.path.join(output_dir, song_name)
            os.makedirs(save_dir, exist_ok=True)

            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_utterance(output_file, waveform, fs, start=s, end=e)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")

    for lang in ["english", "korean"]:
        split_to_utterances(os.path.join(dataset_path, lang), utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["csd"]
    # every item is a tuple (language, song)
    golden_songs = [s.split("_")[:2] for s in golden_samples]
    # language_song, eg: en_001a
    return golden_songs


def csd_statistics(data_dir):
    languages = []
    songs = []
    languages2songs = defaultdict(lambda: defaultdict(list))

    folder_infos = glob(data_dir + "/*")

    for folder_info in folder_infos:
        folder_info_split = folder_info.split("/")[-1]

        language = folder_info_split[:2]
        song = folder_info_split[2:]

        languages.append(language)
        songs.append(song)

        utts = glob(folder_info + "/*")

        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            languages2songs[language][song].append(uid)

    unique_languages = list(set(languages))
    unique_songs = list(set(songs))
    unique_languages.sort()
    unique_songs.sort()

    print(
        "csd: {} languages, {} utterances ({} unique songs)".format(
            len(unique_languages), len(songs), len(unique_songs)
        )
    )
    print("Languages: \n{}".format("\t".join(unique_languages)))
    return languages2songs


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for csd...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "csd")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    csd_path = os.path.join(dataset_path, "utterances")

    language2songs = csd_statistics(csd_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for language, songs in tqdm(language2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "csd",
                    "Singer": "Female1_{}".format(language),
                    "Uid": "{}_{}_{}".format(language, chosen_song, chosen_uid),
                }
                res["Path"] = "{}{}/{}.wav".format(language, chosen_song, chosen_uid)
                res["Path"] = os.path.join(csd_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if [language, chosen_song] in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/custom.py
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from glob import glob
import os
import json
import torchaudio
from tqdm import tqdm
from collections import defaultdict

from utils.util import has_existed


def statistics(utterance_dir):
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(utterance_dir + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        song_infos = glob(singer_info + "/*")

        for song_info in song_infos:
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            utts = glob(song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "Statistics: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers


def main(output_path, dataset_path, dataset_name):
    print("-" * 10)
    print("Preparing samples for {}...\n".format(dataset_name))

    save_dir = os.path.join(output_path, dataset_name)
    os.makedirs(save_dir, exist_ok=True)

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    singers2songs, unique_singers = statistics(dataset_path)

    # We select songs of standard samples as test songs
    train = []
    test = []
    test_songs = set()

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in singers2songs.items():
        song_names = list(songs.keys())

        print("Singer {}...".format(singer))
        for chosen_song in tqdm(song_names):
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": dataset_name,
                    "Singer": singer,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
                res["Path"] = os.path.join(dataset_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                # Remove the utterance whose duration is shorter than 0.01s
                if duration <= 1e-2:
                    continue

                # Place into train or test
                if "{}_{}".format(singer, chosen_song) not in test_songs:
                    test_songs.add("{}_{}".format(singer, chosen_song))

                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
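The generic custom.py preprocessor above assumes an on-disk layout of <dataset_path>/<singer>/<song>/<uid>.wav. A minimal hedged sketch of a direct call (all names below are illustrative placeholders, not defined by this commit):

# Hedged usage sketch for the custom preprocessor; paths and name are assumptions.
from preprocessors import custom

custom.main(
    output_path="processed_data",            # output metadata directory
    dataset_path="/path/to/singing_corpus",  # singer/song/uid.wav tree
    dataset_name="my_corpus",                # used as the output subfolder and "Dataset" field
)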
preprocessors/kising.py
ADDED
@@ -0,0 +1,116 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_folders():
    golden_samples = GOLDEN_TEST_SAMPLES["kising"]
    # every item is a string
    golden_folders = [s.split("_")[:1] for s in golden_samples]
    # folder, eg: 422
    return golden_folders


def KiSing_statistics(data_dir):
    folders = []
    folders2utts = defaultdict(list)

    folder_infos = glob(data_dir + "/*")

    for folder_info in folder_infos:
        folder = folder_info.split("/")[-1]

        folders.append(folder)

        utts = glob(folder_info + "/*.wav")

        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            folders2utts[folder].append(uid)

    unique_folders = list(set(folders))
    unique_folders.sort()

    print("KiSing: {} unique songs".format(len(unique_folders)))
    return folders2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for KiSing...\n")

    save_dir = os.path.join(output_path, "kising")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    KiSing_dir = dataset_path

    folders2utts = KiSing_statistics(KiSing_dir)
    test_folders = get_test_folders()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    folder_names = list(folders2utts.keys())

    for chosen_folder in folder_names:
        for chosen_uid in folders2utts[chosen_folder]:
            res = {
                "Dataset": "kising",
                "Singer": "female1",
                "Uid": "{}_{}".format(chosen_folder, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(chosen_folder, chosen_uid)
            res["Path"] = os.path.join(KiSing_dir, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if ([chosen_folder]) in test_folders:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/libritts.py
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed


def libritts_statistics(data_dir):
    speakers = []
    distribution2speakers2pharases2utts = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list))
    )

    distribution_infos = glob(data_dir + "/*")

    for distribution_info in distribution_infos:
        distribution = distribution_info.split("/")[-1]
        print(distribution)

        speaker_infos = glob(distribution_info + "/*")

        if len(speaker_infos) == 0:
            continue

        for speaker_info in speaker_infos:
            speaker = speaker_info.split("/")[-1]

            speakers.append(speaker)

            pharase_infos = glob(speaker_info + "/*")

            for pharase_info in pharase_infos:
                pharase = pharase_info.split("/")[-1]

                utts = glob(pharase_info + "/*.wav")

                for utt in utts:
                    uid = utt.split("/")[-1].split(".")[0]
                    distribution2speakers2pharases2utts[distribution][speaker][
                        pharase
                    ].append(uid)

    unique_speakers = list(set(speakers))
    unique_speakers.sort()

    print("Speakers: \n{}".format("\t".join(unique_speakers)))
    return distribution2speakers2pharases2utts, unique_speakers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing samples for libritts...\n")

    save_dir = os.path.join(output_path, "libritts")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    libritts_path = dataset_path

    distribution2speakers2pharases2utts, unique_speakers = libritts_statistics(
        libritts_path
    )

    # We select phrases of the standard speaker as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for distribution, speakers2pharases2utts in tqdm(
        distribution2speakers2pharases2utts.items()
    ):
        for speaker, pharases2utts in tqdm(speakers2pharases2utts.items()):
            pharase_names = list(pharases2utts.keys())

            for chosen_pharase in pharase_names:
                for chosen_uid in pharases2utts[chosen_pharase]:
                    res = {
                        "Dataset": "libritts",
                        "Singer": speaker,
                        "Uid": "{}#{}#{}#{}".format(
                            distribution, speaker, chosen_pharase, chosen_uid
                        ),
                    }
                    res["Path"] = "{}/{}/{}/{}.wav".format(
                        distribution, speaker, chosen_pharase, chosen_uid
                    )
                    res["Path"] = os.path.join(libritts_path, res["Path"])
                    assert os.path.exists(res["Path"])

                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                    res["Duration"] = duration

                    if "train" not in distribution:
                        res["index"] = test_index_count
                        test_total_duration += duration
                        test.append(res)
                        test_index_count += 1
                    else:
                        res["index"] = train_index_count
                        train_total_duration += duration
                        train.append(res)
                        train_index_count += 1

                    utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/lijian.py
ADDED
@@ -0,0 +1,151 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import glob
import os
import json
import torchaudio
from tqdm import tqdm
from collections import defaultdict

from utils.io import save_audio
from utils.util import has_existed, remove_and_create
from utils.audio_slicer import Slicer
from preprocessors import GOLDEN_TEST_SAMPLES


def split_to_utterances(input_dir, output_dir):
    print("Splitting to utterances for {}...".format(input_dir))

    files_list = glob.glob("*.flac", root_dir=input_dir)
    files_list.sort()
    for wav_file in tqdm(files_list):
        # Load waveform
        waveform, fs = torchaudio.load(os.path.join(input_dir, wav_file))

        # Song name
        filename = wav_file.replace(" ", "")
        filename = filename.replace("(Live)", "")
        song_id, filename = filename.split("李健-")

        song_id = song_id.split("_")[0]
        song_name = "{:03d}".format(int(song_id)) + filename.split("_")[0].split("-")[0]

        # Split
        slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000)
        chunks = slicer.slice(waveform)

        save_dir = os.path.join(output_dir, song_name)
        remove_and_create(save_dir)

        for i, chunk in enumerate(chunks):
            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")
    split_to_utterances(os.path.join(dataset_path, "vocal_v2"), utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["lijian"]
    golden_songs = [s.split("_")[0] for s in golden_samples]
    return golden_songs


def statistics(utt_dir):
    song2utts = defaultdict(list)

    song_infos = glob.glob(utt_dir + "/*")
    song_infos.sort()
    for song in song_infos:
        song_name = song.split("/")[-1]
        utt_infos = glob.glob(song + "/*.wav")
        utt_infos.sort()
        for utt in utt_infos:
            uid = utt.split("/")[-1].split(".")[0]
            song2utts[song_name].append(uid)

    utt_sum = sum([len(utts) for utts in song2utts.values()])
    print("Li Jian: {} unique songs, {} utterances".format(len(song2utts), utt_sum))
    return song2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for Li Jian...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "lijian")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    lijian_path = os.path.join(dataset_path, "utterances")
    song2utts = statistics(lijian_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for chosen_song, utts in tqdm(song2utts.items()):
        for chosen_uid in song2utts[chosen_song]:
            res = {
                "Dataset": "lijian",
                "Singer": "lijian",
                "Uid": "{}_{}".format(chosen_song, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(chosen_song, chosen_uid)
            res["Path"] = os.path.join(lijian_path, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if duration <= 1e-8:
                continue

            if chosen_song in test_songs:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/ljspeech.py
ADDED
@@ -0,0 +1,197 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from tqdm import tqdm
import os
import torchaudio
from utils import audio
import csv
import random

from utils.util import has_existed
from text import _clean_text
import librosa
import soundfile as sf
from scipy.io import wavfile

from pathlib import Path
import numpy as np


def textgird_extract(
    corpus_directory,
    output_directory,
    mfa_path=os.path.join("mfa", "montreal-forced-aligner", "bin", "mfa_align"),
    lexicon=os.path.join("mfa", "lexicon", "librispeech-lexicon.txt"),
    acoustic_model_path=os.path.join(
        "mfa", "montreal-forced-aligner", "pretrained_models", "english.zip"
    ),
    jobs="8",
):
    assert os.path.exists(
        corpus_directory
    ), "Please check that the directory contains *.wav, *.lab"
    assert (
        os.path.exists(mfa_path)
        and os.path.exists(lexicon)
        and os.path.exists(acoustic_model_path)
    ), f"Please download the MFA tools to {mfa_path} firstly"
    Path(output_directory).mkdir(parents=True, exist_ok=True)
    print(f"MFA results are saved in {output_directory}")
    os.system(
        f".{os.path.sep}{mfa_path} {corpus_directory} {lexicon} {acoustic_model_path} {output_directory} -j {jobs} --clean"
    )


def get_lines(file):
    lines = []
    with open(file, encoding="utf-8") as f:
        for line in tqdm(f):
            lines.append(line.strip())
    return lines


def get_uid2utt(ljspeech_path, dataset, cfg):
    index_count = 0
    total_duration = 0

    uid2utt = []
    for l in tqdm(dataset):
        items = l.split("|")
        uid = items[0]
        text = items[2]

        res = {
            "Dataset": "LJSpeech",
            "index": index_count,
            "Singer": "LJSpeech",
            "Uid": uid,
            "Text": text,
        }

        # Duration in wav files
        audio_file = os.path.join(ljspeech_path, "wavs/{}.wav".format(uid))

        res["Path"] = audio_file

        waveform, sample_rate = torchaudio.load(audio_file)
        duration = waveform.size(-1) / sample_rate
        res["Duration"] = duration

        uid2utt.append(res)

        index_count = index_count + 1
        total_duration += duration

    return uid2utt, total_duration / 3600


def split_dataset(lines, test_rate=0.05, test_size=None):
    if test_size is None:
        test_size = int(len(lines) * test_rate)
    random.shuffle(lines)

    train_set = []
    test_set = []

    for line in lines[:test_size]:
        test_set.append(line)
    for line in lines[test_size:]:
        train_set.append(line)
    return train_set, test_set


max_wav_value = 32768.0


def prepare_align(dataset, dataset_path, cfg, output_path):
    in_dir = dataset_path
    out_dir = os.path.join(output_path, dataset, cfg.raw_data)
    sampling_rate = cfg.sample_rate
    cleaners = cfg.text_cleaners
    speaker = "LJSpeech"
    with open(os.path.join(dataset_path, "metadata.csv"), encoding="utf-8") as f:
        for line in tqdm(f):
            parts = line.strip().split("|")
            base_name = parts[0]
            text = parts[2]
            text = _clean_text(text, cleaners)

            output_wav_path = os.path.join(out_dir, speaker, "{}.wav".format(base_name))
            output_lab_path = os.path.join(out_dir, speaker, "{}.lab".format(base_name))

            if os.path.exists(output_wav_path) and os.path.exists(output_lab_path):
                continue

            wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
            if os.path.exists(wav_path):
                os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
                wav, _ = librosa.load(wav_path, sr=sampling_rate)
                wav = wav / max(abs(wav)) * max_wav_value

                wavfile.write(
                    os.path.join(out_dir, speaker, "{}.wav".format(base_name)),
                    sampling_rate,
                    wav.astype(np.int16),
                )

                with open(
                    os.path.join(out_dir, speaker, "{}.lab".format(base_name)),
                    "w",
                ) as f1:
                    f1.write(text)
    # Extract textgird with MFA
    textgird_extract(
        corpus_directory=out_dir,
        output_directory=os.path.join(output_path, dataset, "TextGrid"),
    )


def main(output_path, dataset_path, cfg):
    print("-" * 10)
    print("Dataset splits for {}...\n".format("LJSpeech"))

    dataset = "LJSpeech"

    save_dir = os.path.join(output_path, dataset)
    os.makedirs(save_dir, exist_ok=True)
    ljspeech_path = dataset_path

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")

    speaker = "LJSpeech"
    speakers = [dataset + "_" + speaker]
    singer_lut = {name: i for i, name in enumerate(sorted(speakers))}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    if has_existed(train_output_file) and has_existed(test_output_file):
        return

    meta_file = os.path.join(ljspeech_path, "metadata.csv")
    lines = get_lines(meta_file)

    train_set, test_set = split_dataset(lines)

    res, hours = get_uid2utt(ljspeech_path, train_set, cfg)

    # Save train
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(res, f, indent=4, ensure_ascii=False)

    print("Train_hours= {}".format(hours))

    res, hours = get_uid2utt(ljspeech_path, test_set, cfg)

    # Save test
    os.makedirs(save_dir, exist_ok=True)
    with open(test_output_file, "w") as f:
        json.dump(res, f, indent=4, ensure_ascii=False)

    print("Test_hours= {}".format(hours))
preprocessors/ljspeech_vocoder.py
ADDED
@@ -0,0 +1,86 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob

from utils.util import has_existed


def main(output_path, dataset_path):
    print("-" * 10)
    print("Dataset splits for ljspeech...\n")

    save_dir = os.path.join(output_path, "ljspeech")
    ljspeech_path = dataset_path

    wave_files = glob(ljspeech_path + "/wavs/*.wav")

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")

    if has_existed(train_output_file):
        return

    utts = []

    for wave_file in tqdm(wave_files):
        res = {
            "Dataset": "ljspeech",
            "Singer": "female1",
            "Uid": "{}".format(wave_file.split("/")[-1].split(".")[0]),
        }
        res["Path"] = wave_file
        assert os.path.exists(res["Path"])

        waveform, sample_rate = torchaudio.load(res["Path"])
        duration = waveform.size(-1) / sample_rate
        res["Duration"] = duration

        if duration <= 1e-8:
            continue

        utts.append(res)

    test_length = len(utts) // 20

    train_utts = []
    train_index_count = 0
    train_total_duration = 0

    for i in tqdm(range(len(utts) - test_length)):
        tmp = utts[i]
        tmp["index"] = train_index_count
        train_index_count += 1
        train_total_duration += tmp["Duration"]
        train_utts.append(tmp)

    test_utts = []
    test_index_count = 0
    test_total_duration = 0

    for i in tqdm(range(len(utts) - test_length, len(utts))):
        tmp = utts[i]
        tmp["index"] = test_index_count
        test_index_count += 1
        test_total_duration += tmp["Duration"]
        test_utts.append(tmp)

    print("#Train = {}, #Test = {}".format(len(train_utts), len(test_utts)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train_utts, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test_utts, f, indent=4, ensure_ascii=False)
preprocessors/m4singer.py
ADDED
@@ -0,0 +1,138 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import librosa
from tqdm import tqdm
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["m4singer"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("_")[:2] for s in golden_samples]
    # singer_song, eg: Alto-1_美错
    golden_songs = ["_".join(t) for t in golden_songs]
    return golden_songs


def m4singer_statistics(meta):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))
    for utt in meta:
        p, s, uid = utt["item_name"].split("#")
        singers.append(p)
        songs.append(s)
        singer2songs[p][s].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "M4Singer: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for m4singer...\n")

    save_dir = os.path.join(output_path, "m4singer")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    m4singer_dir = dataset_path
    meta_file = os.path.join(m4singer_dir, "meta.json")
    with open(meta_file, "r", encoding="utf-8") as f:
        meta = json.load(f)

    singer2songs, unique_singers = m4singer_statistics(meta)

    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singer2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            chosen_song = chosen_song.replace(" ", "-")
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "m4singer",
                    "Singer": singer,
                    "Song": chosen_song,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }

                res["Path"] = os.path.join(
                    m4singer_dir, "{}#{}/{}.wav".format(singer, chosen_song, chosen_uid)
                )
                assert os.path.exists(res["Path"])

                duration = librosa.get_duration(filename=res["Path"])
                res["Duration"] = duration

                if "_".join([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/metadata.py
ADDED
@@ -0,0 +1,138 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
from tqdm import tqdm


def cal_metadata(cfg):
    """
    Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.
    """
    from collections import Counter

    datasets = cfg.dataset

    print("-" * 10)
    print("Preparing metadata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    datasets.sort()

    for dataset in tqdm(datasets):
        save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
        assert os.path.exists(save_dir)

        # 'train.json' and 'test.json' of target dataset
        train_metadata = os.path.join(save_dir, "train.json")
        test_metadata = os.path.join(save_dir, "test.json")

        # Sort the metadata as the duration order
        with open(train_metadata, "r", encoding="utf-8") as f:
            train_utterances = json.load(f)
        with open(test_metadata, "r", encoding="utf-8") as f:
            test_utterances = json.load(f)

        train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
        test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])

        # Write back the sorted metadata
        with open(train_metadata, "w") as f:
            json.dump(train_utterances, f, indent=4, ensure_ascii=False)
        with open(test_metadata, "w") as f:
            json.dump(test_utterances, f, indent=4, ensure_ascii=False)

        # Paths of metadata needed to be generated
        singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
        utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)

        # Get the total duration and singer names for train and test utterances
        train_total_duration = sum(utt["Duration"] for utt in train_utterances)
        test_total_duration = sum(utt["Duration"] for utt in test_utterances)

        singer_names = set(
            f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            for utt in train_utterances + test_utterances
        )

        # Write the utt2singer file and sort the singer names
        with open(utt2singer_file, "w", encoding="utf-8") as f:
            for utt in train_utterances + test_utterances:
                f.write(
                    f"{utt['Dataset']}_{utt['Uid']}\t{replace_augment_name(utt['Dataset'])}_{utt['Singer']}\n"
                )

        singer_names = sorted(singer_names)
        singer_lut = {name: i for i, name in enumerate(singer_names)}

        # dump singers.json
        with open(singer_dict_file, "w", encoding="utf-8") as f:
            json.dump(singer_lut, f, indent=4, ensure_ascii=False)

        meta_info = {
            "dataset": dataset,
            "statistics": {
                "size": len(train_utterances) + len(test_utterances),
                "hours": round(train_total_duration / 3600, 4)
                + round(test_total_duration / 3600, 4),
            },
            "train": {
                "size": len(train_utterances),
                "hours": round(train_total_duration / 3600, 4),
            },
            "test": {
                "size": len(test_utterances),
                "hours": round(test_total_duration / 3600, 4),
            },
            "singers": {"size": len(singer_lut)},
        }
        # Use Counter to count the minutes for each singer
        total_singer2mins = Counter()
        training_singer2mins = Counter()
        for utt in train_utterances:
            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            training_singer2mins[k] += utt["Duration"] / 60
            total_singer2mins[k] += utt["Duration"] / 60
        for utt in test_utterances:
            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            total_singer2mins[k] += utt["Duration"] / 60

        training_singer2mins = dict(
            sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True)
        )
        training_singer2mins = {k: round(v, 2) for k, v in training_singer2mins.items()}
        meta_info["singers"]["training_minutes"] = training_singer2mins

        total_singer2mins = dict(
            sorted(total_singer2mins.items(), key=lambda x: x[1], reverse=True)
        )
        total_singer2mins = {k: round(v, 2) for k, v in total_singer2mins.items()}
        meta_info["singers"]["minutes"] = total_singer2mins

        with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
            json.dump(meta_info, f, indent=4, ensure_ascii=False)

        for singer, mins in training_singer2mins.items():
            print(f"Singer {singer}: {mins} mins for training")
        print("-" * 10, "\n")


def replace_augment_name(dataset: str) -> str:
    """Replace the augmented dataset name with the original dataset name.
    >>> print(replace_augment_name("dataset_equalizer"))
    dataset
    """
    if "equalizer" in dataset:
        dataset = dataset.replace("_equalizer", "")
    elif "formant_shift" in dataset:
        dataset = dataset.replace("_formant_shift", "")
    elif "pitch_shift" in dataset:
        dataset = dataset.replace("_pitch_shift", "")
    elif "time_stretch" in dataset:
        dataset = dataset.replace("_time_stretch", "")
    else:
        pass
    return dataset
preprocessors/nus48e.py
ADDED
@@ -0,0 +1,203 @@
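A brief note on what the nus48e preprocessor below assumes about on-disk layout: raw songs live under <dataset_path>/<singer>/<read|sing>/<song>.wav, the slicer writes utterances to <dataset_path>/utterances/<singer>/<style>/<song>/<index>.wav, and test membership is decided by (singer, song) pairs parsed from golden sample ids. The sample id in this sketch is an illustrative assumption, not taken from the real GOLDEN_TEST_SAMPLES table.

# Illustrative only: how a golden sample id maps to a (singer, song) test key.
example_golden = "ADIZ_sing#09#0003"       # assumed format: <singer>_<style>#<song>#<uid>
singer_style, song = example_golden.split("#")[:2]
print([singer_style, song])                # compared against [singer, chosen_song] below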
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict


from utils.io import save_audio
from utils.util import has_existed
from utils.audio_slicer import Slicer
from preprocessors import GOLDEN_TEST_SAMPLES


def split_to_utterances(dataset_path, singer, style, output_dir):
    data_dir = os.path.join(dataset_path, singer, style)

    print("Splitting to utterances for {}...".format(data_dir))

    wave_files = glob(data_dir + "/*.wav")

    for wav_file in tqdm(wave_files):
        # Load waveform
        song_name = wav_file.split("/")[-1].split(".")[0]
        waveform, fs = torchaudio.load(wav_file)

        # Split
        slicer = Slicer(sr=fs, threshold=-40.0, max_sil_kept=4000)
        chunks = slicer.slice(waveform)

        for i, chunk in enumerate(chunks):
            save_dir = os.path.join(output_dir, singer, style, song_name)
            os.makedirs(save_dir, exist_ok=True)

            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")

    singer_infos = glob(dataset_path + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        for style in ["read", "sing"]:
            split_to_utterances(dataset_path, singer, style, utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["nus48e"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("#")[:2] for s in golden_samples]
    # singer_song, eg: Female1#Almost_lover_Amateur
    return golden_songs


def nus48e_statistics(data_dir):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(data_dir + "/*")

    for singer_info in singer_infos:
        singer_info_split = singer_info.split("/")[-1]

        style_infos = glob(singer_info + "/*")

        for style_info in style_infos:
            style_info_split = style_info.split("/")[-1]

            singer = singer_info_split + "_" + style_info_split
            singers.append(singer)

            song_infos = glob(style_info + "/*")

            for song_info in song_infos:
                song = song_info.split("/")[-1]

                songs.append(song)

                utts = glob(song_info + "/*.wav")

                for utt in utts:
                    uid = utt.split("/")[-1].split(".")[0]
                    singer2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "nus_48_e: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for nus48e...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "nus48e")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    nus48e_path = os.path.join(dataset_path, "utterances")

    singer2songs, unique_singers = nus48e_statistics(nus48e_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in singer2songs.items():
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "nus48e",
                    "Singer": singer,
                    "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}/{}.wav".format(
                    singer.split("_")[0], singer.split("_")[-1], chosen_song, chosen_uid
                )
                res["Path"] = os.path.join(nus48e_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if duration <= 1e-8:
                    continue

                if ([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/opencpop.py
ADDED
@@ -0,0 +1,73 @@
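For orientation, the opencpop preprocessor below only needs the utterance id from each transcription line: it splits on "|" and keeps the first field, then resolves the audio under segments/wavs/. The line content in this sketch is a placeholder, not a real Opencpop annotation.

# Illustrative only: parsing one line of segments/train.txt (fields after the uid are ignored here).
line = "2001000001|<text>|<phonemes>|<notes>|..."
uid = line.split("|")[0]
wav_path = "segments/wavs/{}.wav".format(uid)
print(uid, wav_path)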
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from tqdm import tqdm
import os
import librosa

from utils.util import has_existed


def get_lines(file):
    with open(file, "r") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    return lines


def get_uid2utt(opencpop_path, dataset, dataset_type):
    index_count = 0
    total_duration = 0

    file = os.path.join(opencpop_path, "segments", "{}.txt".format(dataset_type))
    lines = get_lines(file)

    uid2utt = []
    for l in tqdm(lines):
        items = l.split("|")
        uid = items[0]

        res = {
            "Dataset": dataset,
            "index": index_count,
            "Singer": "female1",
            "Uid": uid,
        }

        # Duration in wav files
        audio_file = os.path.join(opencpop_path, "segments/wavs/{}.wav".format(uid))
        res["Path"] = audio_file

        duration = librosa.get_duration(filename=res["Path"])
        res["Duration"] = duration

        uid2utt.append(res)

        index_count = index_count + 1
        total_duration += duration

    return uid2utt, total_duration / 3600


def main(dataset, output_path, dataset_path):
    print("-" * 10)
    print("Dataset splits for {}...\n".format(dataset))

    save_dir = os.path.join(output_path, dataset)
    opencpop_path = dataset_path
    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        res, hours = get_uid2utt(opencpop_path, dataset, dataset_type)

        # Save
        os.makedirs(save_dir, exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)

        print("{}_{}_hours= {}".format(dataset, dataset_type, hours))
preprocessors/opensinger.py
ADDED
@@ -0,0 +1,169 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import os
import json
import librosa
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["opensinger"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("_")[:3] for s in golden_samples]
    # singer_song, eg: Female1#Almost_lover_Amateur
    return golden_songs


def opensinger_statistics(data_dir):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))

    gender_infos = glob(data_dir + "/*")

    for gender_info in gender_infos:
        gender_info_split = gender_info.split("/")[-1][:-3]

        singer_and_song_infos = glob(gender_info + "/*")

        for singer_and_song_info in singer_and_song_infos:
            singer_and_song_info_split = singer_and_song_info.split("/")[-1].split("_")
            singer_id, song = (
                singer_and_song_info_split[0],
                singer_and_song_info_split[1],
            )
            singer = gender_info_split + "_" + singer_id
            singers.append(singer)
            songs.append(song)

            utts = glob(singer_and_song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split("_")[-1].split(".")[0]
                singer2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "opensinger: {} singers, {} songs ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for opensinger...\n")

    save_dir = os.path.join(output_path, "opensinger")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    opensinger_path = dataset_path

    singer2songs, unique_singers = opensinger_statistics(opensinger_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for i, (singer, songs) in enumerate(singer2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in tqdm(
            song_names, desc="Singer {}/{}".format(i, len(singer2songs))
        ):
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "opensinger",
                    "Singer": singer,
                    "Song": chosen_song,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}Raw/{}_{}/{}_{}_{}.wav".format(
                    singer.split("_")[0],
                    singer.split("_")[1],
                    chosen_song,
                    singer.split("_")[1],
                    chosen_song,
                    chosen_uid,
                )
                res["Path"] = os.path.join(opensinger_path, res["Path"])
                assert os.path.exists(res["Path"])

                duration = librosa.get_duration(filename=res["Path"])
                res["Duration"] = duration

                if duration > 30:
                    print(
                        "Wav file: {}, the duration = {:.2f}s > 30s, which has been abandoned.".format(
                            res["Path"], duration
                        )
                    )
                    continue

                if (
                    [singer.split("_")[0], singer.split("_")[1], chosen_song]
                ) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/opera.py
ADDED
@@ -0,0 +1,186 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
from tqdm import tqdm
import torchaudio
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from utils.io import save_audio
from utils.audio_slicer import Slicer
from preprocessors import GOLDEN_TEST_SAMPLES


def split_to_utterances(language_dir, output_dir):
    print("Splitting to utterances for {}...".format(language_dir))

    for wav_file in tqdm(glob("{}/*/*".format(language_dir))):
        # Load waveform
        singer_name, song_name = wav_file.split("/")[-2:]
        song_name = song_name.split(".")[0]
        waveform, fs = torchaudio.load(wav_file)

        # Split
        slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000)
        chunks = slicer.slice(waveform)

        for i, chunk in enumerate(chunks):
            save_dir = os.path.join(output_dir, singer_name, song_name)
            os.makedirs(save_dir, exist_ok=True)

            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")

    for lang in ["chinese", "western"]:
        split_to_utterances(os.path.join(dataset_path, lang), utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["opera"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("#")[:2] for s in golden_samples]
    # singer#song, eg: fem_01#neg_01
    return golden_songs


def opera_statistics(data_dir):
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(data_dir + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        song_infos = glob(singer_info + "/*")

        for song_info in song_infos:
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            utts = glob(song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "opera: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for opera...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "opera")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    opera_path = os.path.join(dataset_path, "utterances")

    singers2songs, unique_singers = opera_statistics(opera_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singers2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "opera",
                    "Singer": singer,
                    "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
                res["Path"] = os.path.join(opera_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if duration <= 1e-8:
                    continue

                if ([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/pjs.py
ADDED
@@ -0,0 +1,135 @@
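A small worked example of the splitting scheme used below (three chunks per trimmed song with a fixed overlap); the sample rate and duration are assumed numbers for illustration only.

# Assumed: a 30 s trimmed song at 16 kHz, n_utterance_splits=3, overlapping=1 s.
sample_rate = 16000
audio_len = 30 * sample_rate
lapping_len = 1 * sample_rate
for i in range(3):
    start = i * audio_len // 3
    end = start + audio_len // 3 + lapping_len
    print(i, start / sample_rate, min(end, audio_len) / sample_rate)
# -> chunks covering roughly 0-11 s, 10-21 s, and 20-30 s.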
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
from tqdm import tqdm
import glob
import json
import torchaudio

from utils.util import has_existed
from utils.io import save_audio


def get_splitted_utterances(
    raw_wav_dir, trimed_wav_dir, n_utterance_splits, overlapping
):
    res = []
    raw_song_files = glob.glob(
        os.path.join(raw_wav_dir, "**/pjs*_song.wav"), recursive=True
    )
    trimed_song_files = glob.glob(
        os.path.join(trimed_wav_dir, "**/*.wav"), recursive=True
    )

    if len(raw_song_files) * n_utterance_splits == len(trimed_song_files):
        print("Splitting already done...")
        for wav_file in tqdm(trimed_song_files):
            uid = wav_file.split("/")[-1].split(".")[0]
            utt = {"Dataset": "pjs", "Singer": "male1", "Uid": uid, "Path": wav_file}

            waveform, sample_rate = torchaudio.load(wav_file)
            duration = waveform.size(-1) / sample_rate
            utt["Duration"] = duration

            res.append(utt)

    else:
        for wav_file in tqdm(raw_song_files):
            song_id = wav_file.split("/")[-1].split(".")[0]

            waveform, sample_rate = torchaudio.load(wav_file)
            trimed_waveform = torchaudio.functional.vad(waveform, sample_rate)
            trimed_waveform = torchaudio.functional.vad(
                trimed_waveform.flip(dims=[1]), sample_rate
            ).flip(dims=[1])

            audio_len = trimed_waveform.size(-1)
            lapping_len = overlapping * sample_rate

            for i in range(n_utterance_splits):
                start = i * audio_len // 3
                end = start + audio_len // 3 + lapping_len
                splitted_waveform = trimed_waveform[:, start:end]

                utt = {
                    "Dataset": "pjs",
                    "Singer": "male1",
                    "Uid": "{}_{}".format(song_id, i),
                }

                # Duration
                duration = splitted_waveform.size(-1) / sample_rate
                utt["Duration"] = duration

                # Save trimed wav
                splitted_waveform_file = os.path.join(
                    trimed_wav_dir, "{}.wav".format(utt["Uid"])
                )
                save_audio(splitted_waveform_file, splitted_waveform, sample_rate)

                # Path
                utt["Path"] = splitted_waveform_file

                res.append(utt)

    res = sorted(res, key=lambda x: x["Uid"])
    return res


def main(output_path, dataset_path, n_utterance_splits=3, overlapping=1):
    """
    1. Split one raw utterance into three splits (since some samples are too long)
    2. Overlapping of adjacent splits is 1 s
    """
    print("-" * 10)
    print("Preparing training dataset for PJS...")

    save_dir = os.path.join(output_path, "pjs")
    raw_wav_dir = os.path.join(dataset_path, "PJS_corpus_ver1.1")

    # Trim for silence
    trimed_wav_dir = os.path.join(dataset_path, "trim")
    os.makedirs(trimed_wav_dir, exist_ok=True)

    # Total utterances
    utterances = get_splitted_utterances(
        raw_wav_dir, trimed_wav_dir, n_utterance_splits, overlapping
    )
    total_uids = [utt["Uid"] for utt in utterances]

    # Test uids
    n_test_songs = 3
    test_uids = []
    for i in range(1, n_test_songs + 1):
        test_uids += [
            "pjs00{}_song_{}".format(i, split_id)
            for split_id in range(n_utterance_splits)
        ]

    # Train uids
    train_uids = [uid for uid in total_uids if uid not in test_uids]

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        uids = eval("{}_uids".format(dataset_type))
        res = [utt for utt in utterances if utt["Uid"] in uids]
        for i in range(len(res)):
            res[i]["index"] = i

        time = sum([utt["Duration"] for utt in res])
        print(
            "{}, Total size: {}, Total Durations = {} s = {:.2f} hour\n".format(
                dataset_type, len(res), time, time / 3600
            )
        )

        # Save
        os.makedirs(save_dir, exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)
preprocessors/popbutfy.py
ADDED
@@ -0,0 +1,153 @@
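The popbutfy preprocessor below derives audio paths from the <singer>#singing#<song> folder naming convention; here is that pattern with placeholder names (illustrative assumptions only, not taken from the dataset).

# Illustrative only: placeholder singer/song/uid values.
singer, song, uid = "Female1", "Almost_lover_Amateur", "0"
path = "{}#singing#{}/{}#singing#{}_{}.mp3".format(singer, song, singer, song, uid)
print(path)  # Female1#singing#Almost_lover_Amateur/Female1#singing#Almost_lover_Amateur_0.mp3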
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
import librosa
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["popbutfy"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("#")[:2] for s in golden_samples]
    # singer#song, eg: Female1#Almost_lover_Amateur
    return golden_songs


def popbutfy_statistics(data_dir):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))

    data_infos = glob(data_dir + "/*")

    for data_info in data_infos:
        data_info_split = data_info.split("/")[-1].split("#")

        singer, song = data_info_split[0], data_info_split[-1]
        singers.append(singer)
        songs.append(song)

        utts = glob(data_info + "/*")

        for utt in utts:
            uid = utt.split("/")[-1].split("_")[-1].split(".")[0]
            singer2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "PopBuTFy: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for popbutfy...\n")

    save_dir = os.path.join(output_path, "popbutfy")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    popbutfy_dir = dataset_path

    singer2songs, unique_singers = popbutfy_statistics(popbutfy_dir)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singer2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "popbutfy",
                    "Singer": singer,
                    "Song": chosen_song,
                    "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}#singing#{}/{}#singing#{}_{}.mp3".format(
                    singer, chosen_song, singer, chosen_song, chosen_uid
                )
                if not os.path.exists(os.path.join(popbutfy_dir, res["Path"])):
                    res["Path"] = "{}#singing#{}/{}#singing#{}_{}.wav".format(
                        singer, chosen_song, singer, chosen_song, chosen_uid
                    )
                res["Path"] = os.path.join(popbutfy_dir, res["Path"])
                assert os.path.exists(res["Path"])

                if res["Path"].split("/")[-1].split(".")[-1] == "wav":
                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                else:
                    waveform, sample_rate = librosa.load(res["Path"])
                    duration = waveform.shape[-1] / sample_rate
                res["Duration"] = duration

                if ([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/popcs.py
ADDED
@@ -0,0 +1,118 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["popcs"]
    # every item is a string
    golden_songs = [s.split("_")[:1] for s in golden_samples]
    # song, eg: 万有引力
    return golden_songs


def popcs_statistics(data_dir):
    songs = []
    songs2utts = defaultdict(list)

    song_infos = glob(data_dir + "/*")

    for song_info in song_infos:
        song_info_split = song_info.split("/")[-1].split("-")[-1]

        songs.append(song_info_split)

        utts = glob(song_info + "/*.wav")

        for utt in utts:
            uid = utt.split("/")[-1].split("_")[0]
            songs2utts[song_info_split].append(uid)

    unique_songs = list(set(songs))
    unique_songs.sort()

    print(
        "popcs: {} utterances ({} unique songs)".format(len(songs), len(unique_songs))
    )
    print("Songs: \n{}".format("\t".join(unique_songs)))
    return songs2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for popcs...\n")

    save_dir = os.path.join(output_path, "popcs")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    popcs_dir = dataset_path

    songs2utts = popcs_statistics(popcs_dir)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    song_names = list(songs2utts.keys())

    for chosen_song in song_names:
        for chosen_uid in songs2utts[chosen_song]:
            res = {
                "Dataset": "popcs",
                "Singer": "female1",
                "Song": chosen_song,
                "Uid": "{}_{}".format(chosen_song, chosen_uid),
            }
            res["Path"] = "popcs-{}/{}_wf0.wav".format(chosen_song, chosen_uid)
            res["Path"] = os.path.join(popcs_dir, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if ([chosen_song]) in test_songs:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/processor.py
ADDED
@@ -0,0 +1,100 @@
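A hedged usage sketch for the dispatcher below; the paths are placeholders and cfg stands in for whatever configuration object the framework has already loaded elsewhere.

# Illustrative call only; commented out because it needs the dataset on disk and a loaded cfg.
# preprocess_dataset(
#     dataset="opencpop",
#     dataset_path="/path/to/Opencpop",
#     output_path="data/processed",
#     cfg=cfg,
# )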
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
from preprocessors import (
    m4singer,
    opencpop,
    svcc,
    pjs,
    popbutfy,
    opensinger,
    popcs,
    kising,
    csd,
    opera,
    nus48e,
    svcceval,
    vctk,
    vctksample,
    libritts,
    lijian,
    cdmusiceval,
    ljspeech,
    coco,
    cocoeval,
    custom,
    vocalist,
    ljspeech_vocoder,
)


def preprocess_dataset(
    dataset, dataset_path, output_path, cfg, is_custom_dataset=False
):
    """Call specific function to handle specific dataset
    Args:
        dataset (str): name of a dataset, e.g. opencpop, m4singer
        dataset_path (str): path to dataset
        output_path (str): path to store preprocessing result files
    """
    if is_custom_dataset:
        custom.main(output_path, dataset_path, dataset_name=dataset)
        return

    if re.match("opencpop*", dataset):
        opencpop.main(dataset, output_path, dataset_path)
    if dataset == "m4singer":
        m4singer.main(output_path, dataset_path)
    if dataset == "svcc":
        svcc.main(output_path, dataset_path)
    if dataset == "pjs":
        pjs.main(output_path, dataset_path)
    if dataset == "popbutfy":
        popbutfy.main(output_path, dataset_path)
    if dataset == "opensinger":
        opensinger.main(output_path, dataset_path)
    if dataset == "popcs":
        popcs.main(output_path, dataset_path)
    if dataset == "kising":
        kising.main(output_path, dataset_path)
    if dataset == "csd":
        csd.main(output_path, dataset_path)
    if dataset == "opera":
        opera.main(output_path, dataset_path)
    if dataset == "nus48e":
        nus48e.main(output_path, dataset_path)
    if dataset == "vctk":
        vctk.main(output_path, dataset_path)
    if dataset == "svcceval":
        svcceval.main(output_path, dataset_path)
    if dataset == "libritts":
        libritts.main(output_path, dataset_path)
    if dataset == "lijian":
        lijian.main(output_path, dataset_path)
    if dataset == "cdmusiceval":
        cdmusiceval.main(output_path, dataset_path)
    if dataset == "LJSpeech":
        ljspeech.main(output_path, dataset_path, cfg)
    if dataset == "ljspeech":
        ljspeech_vocoder.main(output_path, dataset_path)
    if dataset == "coco":
        coco.main(output_path, dataset_path)
    if dataset == "cocoeval":
        cocoeval.main(output_path, dataset_path)
    if dataset == "vocalist":
        vocalist.main(output_path, dataset_path)


def prepare_align(dataset, dataset_path, cfg, output_path):
    """Call specific function to handle specific dataset

    Args:
        dataset (str): name of a dataset, e.g. ljspeech
        dataset_path (str): path to dataset
        output_path (str): path to store preprocessing result files
    """
    if dataset == "LJSpeech":
        ljspeech.prepare_align(dataset, dataset_path, cfg, output_path)
preprocessors/svcc.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import glob
import librosa
import json

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing training dataset for svcc...")

    data_dir = os.path.join(dataset_path, "Data")
    save_dir = os.path.join(output_path, "svcc")
    os.makedirs(save_dir, exist_ok=True)

    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    utt2singer = open(utt2singer_file, "w")

    # Load utterances
    train = []
    test = []
    singers = []

    for wav_file in glob.glob(os.path.join(data_dir, "*/*.wav")):
        singer, filename = wav_file.split("/")[-2:]
        uid = filename.split(".")[0]
        utt = {
            "Dataset": "svcc",
            "Singer": singer,
            "Uid": "{}_{}".format(singer, uid),
            "Path": wav_file,
        }

        # Duration
        duration = librosa.get_duration(filename=wav_file)
        utt["Duration"] = duration

        if utt["Uid"] in GOLDEN_TEST_SAMPLES["svcc"]:
            test.append(utt)
        else:
            train.append(utt)

        singers.append(singer)
        utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))

    # Save singers.json
    unique_singers = list(set(singers))
    unique_singers.sort()
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    train_total_duration = sum([utt["Duration"] for utt in train])
    test_total_duration = sum([utt["Duration"] for utt in test])

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        utterances = eval(dataset_type)
        utterances = sorted(utterances, key=lambda x: x["Uid"])

        for i in range(len(utterances)):
            utterances[i]["index"] = i

        print("{}: Total size: {}\n".format(dataset_type, len(utterances)))

        # Save
        with open(output_file, "w") as f:
            json.dump(utterances, f, indent=4, ensure_ascii=False)

    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )
preprocessors/svcceval.py
ADDED
@@ -0,0 +1,80 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import glob
import librosa
import json

from utils.util import has_existed


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing training dataset for svcceval...")

    data_dir = os.path.join(dataset_path, "Data")
    save_dir = os.path.join(output_path, "svcceval")
    os.makedirs(save_dir, exist_ok=True)

    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    utt2singer = open(utt2singer_file, "w")

    # Load utterances
    train = []
    test = []
    singers = []
    for wav_file in glob.glob(os.path.join(data_dir, "*/*.wav")):
        singer, filename = wav_file.split("/")[-2:]
        uid = filename.split(".")[0]
        utt = {
            "Dataset": "svcceval",
            "Singer": singer,
            "Uid": "{}_{}".format(singer, uid),
            "Path": wav_file,
        }

        # Duration
        duration = librosa.get_duration(filename=wav_file)
        utt["Duration"] = duration

        test.append(utt)

        singers.append(singer)
        utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))

    # Save singers.json
    unique_singers = list(set(singers))
    unique_singers.sort()
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    train_total_duration = sum([utt["Duration"] for utt in train])
    test_total_duration = sum([utt["Duration"] for utt in test])

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        utterances = eval(dataset_type)
        utterances = sorted(utterances, key=lambda x: x["Uid"])

        for i in range(len(utterances)):
            utterances[i]["index"] = i

        print("{}: Total size: {}\n".format(dataset_type, len(utterances)))

        # Save
        with open(output_file, "w") as f:
            json.dump(utterances, f, indent=4, ensure_ascii=False)

    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )
preprocessors/vctk.py
ADDED
@@ -0,0 +1,163 @@
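For orientation, the vctk preprocessor below builds each utterance id from the second and third underscore-separated fields of the flac filename; the filename here follows the public VCTK naming scheme and is used purely as an example.

# Example: wav48_silence_trimmed/p225/p225_001_mic1.flac -> uid "001_mic1"
fname = "p225_001_mic1.flac"
uid = fname.split("_")[1] + "_" + fname.split("_")[2].split(".")[0]
print(uid)  # 001_mic1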
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import librosa
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed


def get_lines(file):
    with open(file, "r") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    return lines


def vctk_statistics(data_dir):
    speakers = []
    speakers2utts = defaultdict(list)

    speaker_infos = glob(data_dir + "/wav48_silence_trimmed" + "/*")

    for speaker_info in speaker_infos:
        speaker = speaker_info.split("/")[-1]

        if speaker == "log.txt":
            continue

        speakers.append(speaker)

        utts = glob(speaker_info + "/*")

        for utt in utts:
            uid = (
                utt.split("/")[-1].split("_")[1]
                + "_"
                + utt.split("/")[-1].split("_")[2].split(".")[0]
            )
            speakers2utts[speaker].append(uid)

    unique_speakers = list(set(speakers))
    unique_speakers.sort()

    print("Speakers: \n{}".format("\t".join(unique_speakers)))
    return speakers2utts, unique_speakers


def vctk_speaker_infos(data_dir):
    file = os.path.join(data_dir, "speaker-info.txt")
    lines = get_lines(file)

    ID2speakers = defaultdict()
    for l in tqdm(lines):
        items = l.replace(" ", "")

        if items[:2] == "ID":
            # The header line
            continue

        if items[0] == "p":
            id = items[:4]
            gender = items[6]
        elif items[0] == "s":
            id = items[:2]
            gender = items[4]

        if gender == "F":
            speaker = "female_{}".format(id)
        elif gender == "M":
            speaker = "male_{}".format(id)

        ID2speakers[id] = speaker

    return ID2speakers


def main(output_path, dataset_path, TEST_NUM_OF_EVERY_SPEAKER=3):
    print("-" * 10)
    print("Preparing test samples for vctk...")

    save_dir = os.path.join(output_path, "vctk")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    vctk_dir = dataset_path

    ID2speakers = vctk_speaker_infos(vctk_dir)
    speaker2utts, unique_speakers = vctk_statistics(vctk_dir)

    # Hold out the first TEST_NUM_OF_EVERY_SPEAKER utterances of every speaker as test utts
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0
    test_speaker_count = defaultdict(int)

    train_total_duration = 0
    test_total_duration = 0

    for i, speaker in enumerate(speaker2utts.keys()):
        for chosen_uid in tqdm(
            speaker2utts[speaker],
            desc="Speaker {}/{}, #Train = {}, #Test = {}".format(
                i + 1, len(speaker2utts), train_index_count, test_index_count
            ),
        ):
            res = {
                "Dataset": "vctk",
                "Singer": ID2speakers[speaker],
                "Uid": "{}#{}".format(ID2speakers[speaker], chosen_uid),
            }
            res["Path"] = "{}/{}_{}.flac".format(speaker, speaker, chosen_uid)
            res["Path"] = os.path.join(vctk_dir, "wav48_silence_trimmed", res["Path"])
            assert os.path.exists(res["Path"])

            duration = librosa.get_duration(filename=res["Path"])
            res["Duration"] = duration

            if test_speaker_count[speaker] < TEST_NUM_OF_EVERY_SPEAKER:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
                test_speaker_count[speaker] += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

            utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
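For orientation, a minimal sketch of calling this preprocessor directly; both paths are placeholders (not values from the repo), and in practice main() is invoked by the preprocessing pipeline rather than by hand:

# Illustrative only: the paths below are placeholders, not part of the repository.
from preprocessors import vctk

vctk.main(
    output_path="/path/to/processed_data",   # vctk/train.json, test.json, singers.json land here
    dataset_path="/path/to/VCTK-Corpus",     # raw corpus root containing wav48_silence_trimmed/
    TEST_NUM_OF_EVERY_SPEAKER=3,             # utterances per speaker held out for the test split
)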
preprocessors/vctkfewsinger.py
ADDED
@@ -0,0 +1,175 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import pickle
import glob
from collections import defaultdict
from tqdm import tqdm


# Train: male 20 hours, female 10 hours
TRAIN_MALE_MAX_SECONDS = 20 * 3600
TRAIN_FEMALE_MAX_SECONDS = 10 * 3600
TEST_MAX_NUM_EVERY_PERSON = 5


def select_sample_idxs():
    chosen_speakers = get_chosen_speakers()

    with open(os.path.join(vctk_dir, "train.json"), "r") as f:
        raw_train = json.load(f)
    with open(os.path.join(vctk_dir, "test.json"), "r") as f:
        raw_test = json.load(f)

    train_idxs, test_idxs = [], []

    # =========== Test ===========
    test_nums = defaultdict(int)
    for utt in tqdm(raw_train):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
            test_nums[singer] += 1
            test_idxs.append("train_{}".format(idx))

    for utt in tqdm(raw_test):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
            test_nums[singer] += 1
            test_idxs.append("test_{}".format(idx))

    # =========== Train ===========
    for utt in tqdm(raw_train):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and "train_{}".format(idx) not in test_idxs:
            train_idxs.append("train_{}".format(idx))

    for utt in tqdm(raw_test):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and "test_{}".format(idx) not in test_idxs:
            train_idxs.append("test_{}".format(idx))

    train_idxs.sort()
    test_idxs.sort()
    return train_idxs, test_idxs, raw_train, raw_test


def statistics_of_speakers():
    speaker2time = defaultdict(float)
    sex2time = defaultdict(float)

    with open(os.path.join(vctk_dir, "train.json"), "r") as f:
        train = json.load(f)
    with open(os.path.join(vctk_dir, "test.json"), "r") as f:
        test = json.load(f)

    for utt in train + test:
        # accumulate seconds per speaker (printed in minutes below)
        speaker2time[utt["Singer"]] += utt["Duration"]
        # accumulate seconds per sex (printed in hours below)
        sex2time[utt["Singer"].split("_")[0]] += utt["Duration"]

    print(
        "Female: {:.2f} hours, Male: {:.2f} hours.\n".format(
            sex2time["female"] / 3600, sex2time["male"] / 3600
        )
    )

    speaker2time = sorted(speaker2time.items(), key=lambda x: x[-1], reverse=True)
    for singer, seconds in speaker2time:
        print("{}\t{:.2f} mins".format(singer, seconds / 60))

    return speaker2time


def get_chosen_speakers():
    speaker2time = statistics_of_speakers()

    chosen_time = defaultdict(float)
    chosen_speaker = defaultdict(list)
    train_constraint = {
        "male": TRAIN_MALE_MAX_SECONDS,
        "female": TRAIN_FEMALE_MAX_SECONDS,
    }

    for speaker, seconds in speaker2time:
        sex = speaker.split("_")[0]
        if chosen_time[sex] < train_constraint[sex]:
            chosen_time[sex] += seconds
            chosen_speaker[sex].append(speaker)

    speaker2time = dict(speaker2time)
    chosen_speaker = chosen_speaker["male"] + chosen_speaker["female"]
    print("\n#Chosen speakers = {}".format(len(chosen_speaker)))
    for spk in chosen_speaker:
        print("{}\t{:.2f} mins".format(spk, speaker2time[spk] / 60))

    return chosen_speaker


if __name__ == "__main__":
    root_path = ""
    vctk_dir = os.path.join(root_path, "vctk")
    fewspeaker_dir = os.path.join(root_path, "vctkfewspeaker")
    os.makedirs(fewspeaker_dir, exist_ok=True)

    train_idxs, test_idxs, raw_train, raw_test = select_sample_idxs()
    print("#Train = {}, #Test = {}".format(len(train_idxs), len(test_idxs)))

    # There is no data leakage between train and test
    assert len(set(train_idxs).intersection(set(test_idxs))) == 0
    for idx in train_idxs + test_idxs:
        # No chosen sample comes from the raw vctk test split
        assert "test_" not in idx

    for split, chosen_idxs in zip(["train", "test"], [train_idxs, test_idxs]):
        print("{}: #chosen idx = {}\n".format(split, len(chosen_idxs)))

        # Select features
        feat_files = glob.glob("**/train.pkl", root_dir=vctk_dir, recursive=True)
        for file in tqdm(feat_files):
            raw_file = os.path.join(vctk_dir, file)
            new_file = os.path.join(
                fewspeaker_dir, file.replace("train.pkl", "{}.pkl".format(split))
            )

            new_dir = "/".join(new_file.split("/")[:-1])
            os.makedirs(new_dir, exist_ok=True)

            if "mel_min" in file or "mel_max" in file:
                os.system("cp {} {}".format(raw_file, new_file))
                continue

            with open(raw_file, "rb") as f:
                raw_feats = pickle.load(f)

            print("file: {}, #raw_feats = {}".format(file, len(raw_feats)))
            new_feats = []
            for idx in chosen_idxs:
                chosen_split_is_train, raw_idx = idx.split("_")
                assert chosen_split_is_train == "train"
                new_feats.append(raw_feats[int(raw_idx)])

            with open(new_file, "wb") as f:
                pickle.dump(new_feats, f)
            print("New file: {}, #new_feats = {}".format(new_file, len(new_feats)))

        # Utterance re-index
        news_utts = [raw_train[int(idx.split("_")[-1])] for idx in chosen_idxs]
        for i, utt in enumerate(news_utts):
            utt["Dataset"] = "vctkfewsinger"
            utt["index"] = i

        with open(os.path.join(fewspeaker_dir, "{}.json".format(split)), "w") as f:
            json.dump(news_utts, f, indent=4)
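A small illustration of the index encoding used above (the value is hypothetical; the split/parse logic mirrors the script):

# Each chosen index is encoded as "<raw split>_<raw index>" relative to the original
# vctk metadata; only entries from the raw train split survive the final assertions.
idx = "train_42"                      # hypothetical entry of train_idxs / test_idxs
raw_split, raw_idx = idx.split("_")
assert raw_split == "train"
print(int(raw_idx))                   # 42: position inside vctk's train.json / train.pkl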
preprocessors/vctksample.py
ADDED
@@ -0,0 +1,108 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import pickle
import glob
from collections import defaultdict
from tqdm import tqdm
from preprocessors import get_golden_samples_indexes


TRAIN_MAX_NUM_EVERY_PERSON = 250
TEST_MAX_NUM_EVERY_PERSON = 25


def select_sample_idxs():
    # =========== Train ===========
    with open(os.path.join(vctk_dir, "train.json"), "r") as f:
        raw_train = json.load(f)

    train_idxs = []
    train_nums = defaultdict(int)
    for utt in tqdm(raw_train):
        idx = utt["index"]
        singer = utt["Singer"]

        if train_nums[singer] < TRAIN_MAX_NUM_EVERY_PERSON:
            train_idxs.append(idx)
            train_nums[singer] += 1

    # =========== Test ===========
    with open(os.path.join(vctk_dir, "test.json"), "r") as f:
        raw_test = json.load(f)

    # golden test
    test_idxs = get_golden_samples_indexes(
        dataset_name="vctk", split="test", dataset_dir=vctk_dir
    )
    test_nums = defaultdict(int)
    for idx in test_idxs:
        singer = raw_test[idx]["Singer"]
        test_nums[singer] += 1

    for utt in tqdm(raw_test):
        idx = utt["index"]
        singer = utt["Singer"]

        if test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
            test_idxs.append(idx)
            test_nums[singer] += 1

    train_idxs.sort()
    test_idxs.sort()
    return train_idxs, test_idxs, raw_train, raw_test


if __name__ == "__main__":
    root_path = ""
    vctk_dir = os.path.join(root_path, "vctk")
    sample_dir = os.path.join(root_path, "vctksample")
    os.makedirs(sample_dir, exist_ok=True)

    train_idxs, test_idxs, raw_train, raw_test = select_sample_idxs()
    print("#Train = {}, #Test = {}".format(len(train_idxs), len(test_idxs)))

    for split, chosen_idxs, utterances in zip(
        ["train", "test"], [train_idxs, test_idxs], [raw_train, raw_test]
    ):
        print(
            "#{} = {}, #chosen idx = {}\n".format(
                split, len(utterances), len(chosen_idxs)
            )
        )

        # Select features
        feat_files = glob.glob(
            "**/{}.pkl".format(split), root_dir=vctk_dir, recursive=True
        )
        for file in tqdm(feat_files):
            raw_file = os.path.join(vctk_dir, file)
            new_file = os.path.join(sample_dir, file)

            new_dir = "/".join(new_file.split("/")[:-1])
            os.makedirs(new_dir, exist_ok=True)

            if "mel_min" in file or "mel_max" in file:
                os.system("cp {} {}".format(raw_file, new_file))
                continue

            with open(raw_file, "rb") as f:
                raw_feats = pickle.load(f)

            print("file: {}, #raw_feats = {}".format(file, len(raw_feats)))
            new_feats = [raw_feats[idx] for idx in chosen_idxs]
            with open(new_file, "wb") as f:
                pickle.dump(new_feats, f)

        # Utterance re-index
        news_utts = [utterances[idx] for idx in chosen_idxs]
        for i, utt in enumerate(news_utts):
            utt["Dataset"] = "vctksample"
            utt["index"] = i

        with open(os.path.join(sample_dir, "{}.json".format(split)), "w") as f:
            json.dump(news_utts, f, indent=4)
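As a quick sanity check (illustrative only, reusing the names defined above), the per-speaker test counts can be inspected after select_sample_idxs() returns:

# Illustrative: golden samples enter test_idxs first, then each speaker is
# topped up to at most TEST_MAX_NUM_EVERY_PERSON utterances.
from collections import Counter

per_speaker = Counter(raw_test[idx]["Singer"] for idx in test_idxs)
print(per_speaker.most_common(5))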
preprocessors/vocalist.py
ADDED
@@ -0,0 +1,137 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed


def vocalist_statistics(data_dir):
    singers = []
    songs = []
    global2singer2songs = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    global_infos = glob(data_dir + "/*")

    for global_info in global_infos:
        global_split = global_info.split("/")[-1]

        singer_infos = glob(global_info + "/*")

        for singer_info in singer_infos:
            singer = singer_info.split("/")[-1]

            singers.append(singer)

            song_infos = glob(singer_info + "/*")
            for song_info in song_infos:
                song = song_info.split("/")[-1]

                songs.append(song)

                utts = glob(song_info + "/*.wav")

                for utt in utts:
                    uid = utt.split("/")[-1].split(".")[0]
                    global2singer2songs[global_split][singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "vocalist: {} singers, {} songs ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return global2singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for vocalist...\n")

    save_dir = os.path.join(output_path, "vocalist")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    vocalist_path = dataset_path

    global2singer2songs, unique_singers = vocalist_statistics(vocalist_path)

    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for global_info, singer2songs in tqdm(global2singer2songs.items()):
        for singer, songs in tqdm(singer2songs.items()):
            song_names = list(songs.keys())

            for chosen_song in song_names:
                for chosen_uid in songs[chosen_song]:
                    res = {
                        # Dataset tag ("opensinger" in the original appears to be a copy-paste slip)
                        "Dataset": "vocalist",
                        "Singer": singer,
                        "Song": chosen_song,
                        "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                    }
                    res["Path"] = "{}/{}/{}/{}.wav".format(
                        global_info, singer, chosen_song, chosen_uid
                    )
                    res["Path"] = os.path.join(vocalist_path, res["Path"])
                    assert os.path.exists(res["Path"])

                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                    res["Duration"] = duration

                    # Every vocalist utterance is used as a test sample; train stays empty.
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1

                    utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
pretrained/bigvgan/args.json
ADDED
@@ -0,0 +1,235 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "exp_name": "bigvgan_large",
    "inference": {
        "batch_size": 1,
    },
    "model": {
        "bigvgan": {
            "activation": "snakebeta",
            "resblock": "1",
            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            "resblock_kernel_sizes": [3, 7, 11],
            "snake_logscale": true,
            "upsample_initial_channel": 1536,
            "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
            "upsample_rates": [4, 4, 2, 2, 2, 2],
        },
        "discriminators": ["mpd", "msstftd"],
        "generator": "bigvgan",
        "mpd": {
            "discriminator_channel_multi": 1,
            "mpd_reshapes": [2, 3, 5, 7, 11],
            "use_spectral_norm": false,
        },
        "mrd": {
            "discriminator_channel_multi": 1,
            "mrd_override": false,
            "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
            "use_spectral_norm": false,
        },
        "msstftd": {
            "filters": 32,
        },
    },
    "model_type": "GANVocoder",
    "preprocess": {
        "audio_dir": "audios",
        "bits": 8,
        "contentvec_dir": "contentvec",
        "cut_mel_frame": 32,
        "data_augment": false,
        "dur_dir": "durs",
        "duration_dir": "duration",
        "emo2id": "emo2id.json",
        "energy_dir": "energys",
        "energy_extract_mode": "from_mel",
        "energy_norm": false,
        "extract_audio": true,
        "extract_contentvec_feature": false,
        "extract_duration": false,
        "extract_energy": false,
        "extract_label": false,
        "extract_mcep": false,
        "extract_mel": true,
        "extract_mert_feature": false,
        "extract_one_hot": false,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_wenet_feature": false,
        "extract_whisper_feature": false,
        "f0_max": 1100,
        "f0_min": 50,
        "file_lst": "file.lst",
        "fmax": 12000,
        "fmin": 0,
        "hop_size": 256,
        "is_mu_law": false,
        "lab_dir": "labs",
        "label_dir": "labels",
        "mcep_dir": "mcep",
        "mel_dir": "mels",
        "mel_min_max_norm": false,
        "min_level_db": -115,
        "n_fft": 1024,
        "n_mel": 100,
        "num_silent_frames": 8,
        "phone_seq_file": "phone_seq_file",
        "pitch_bin": 256,
        "pitch_dir": "pitches",
        "pitch_extractor": "parselmouth",
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "pitch_norm": false,
        "processed_dir": "processed_data",
        "ref_level_db": 20,
        "sample_rate": 24000,
        "spk2id": "singers.json",
        "train_file": "train.json",
        "trim_fft_size": 512,
        "trim_hop_size": 128,
        "trim_silence": false,
        "trim_top_db": 30,
        "trimmed_wav_dir": "trimmed_wavs",
        "use_audio": true,
        "use_dur": false,
        "use_emoid": false,
        "use_frame_duration": false,
        "use_frame_energy": false,
        "use_frame_pitch": false,
        "use_lab": false,
        "use_label": false,
        "use_log_scale_energy": false,
        "use_log_scale_pitch": false,
        "use_mel": true,
        "use_one_hot": false,
        "use_phn_seq": false,
        "use_phone_duration": false,
        "use_phone_energy": false,
        "use_phone_pitch": false,
        "use_spkid": false,
        "use_uv": false,
        "use_wav": false,
        "use_wenet": false,
        "utt2emo": "utt2emo",
        "utt2spk": "utt2spk",
        "uv_dir": "uvs",
        "valid_file": "test.json",
        "wav_dir": "wavs",
        "wenet_dir": "wenet",
        "win_size": 1024,
    },
    "supported_model_type": ["GANVocoder", "Fastspeech2", "DiffSVC", "Transformer", "EDM", "CD"],
    "train": {
        "adamw": {
            "adam_b1": 0.8,
            "adam_b2": 0.99,
            "lr": 0.0002,
        },
        "batch_size": 4,
        "criterions": ["feature", "discriminator", "generator", "mel"],
        "dataloader": {
            "num_worker": 4,
            "pin_memory": true,
        },
        "ddp": true,
        "epochs": 50000,
        "exponential_lr": {
            "lr_decay": 0.999,
        },
        "gradient_accumulation_step": 1,
        "keep_checkpoint_max": 5,
        "max_epoch": 1000000,
        "max_steps": 1000000,
        "multi_speaker_training": false,
        "random_seed": 114514,
        "run_eval": [true],
        "sampler": {
            "drop_last": true,
            "holistic_shuffle": true,
        },
        "save_checkpoint_stride": [200],
        "save_checkpoints_steps": 10000,
        "save_summary_steps": 500,
        "total_training_steps": 50000,
        "tracker": ["tensorboard"],
        "valid_interval": 10000,
    },
}
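Amphion's own config loader is not part of this diff; as a minimal sketch, assuming a JSON5-tolerant parser (the dump keeps trailing commas, which the standard json module rejects), the file can be read like this:

# Illustrative only: json5 (pip install json5) is an assumption, not a documented dependency here.
import json5

with open("pretrained/bigvgan/args.json") as f:
    args = json5.load(f)

print(args["model"]["bigvgan"]["upsample_rates"])   # [4, 4, 2, 2, 2, 2]
print(args["preprocess"]["sample_rate"])            # 24000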
pretrained/contentvec/README.md
ADDED
@@ -0,0 +1,5 @@
# Download

- [Link](https://github.com/auspicious3000/contentvec)
- Model: `ContentVec_legacy`
- Classes: 500
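Presumably the downloaded `ContentVec_legacy` checkpoint is meant to be placed in this directory (`pretrained/contentvec/`) so that the ContentVec-based feature extraction referenced in the preprocessing configs can locate it; the exact expected filename is not stated in this diff.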