Billpai committed
Commit · 0312eff
1 Parent(s): f196feb
test

Browse files

- optimizer/__init__.py +0 -0
- optimizer/optimizers.py +780 -0
- preprocessors/__init__.py +189 -0
- preprocessors/bigdata.py +145 -0
- preprocessors/cdmusiceval.py +174 -0
- preprocessors/coco.py +100 -0
- preprocessors/cocoeval.py +99 -0
- preprocessors/csd.py +202 -0
- preprocessors/custom.py +143 -0
- preprocessors/kising.py +116 -0
- preprocessors/libritts.py +143 -0
- preprocessors/lijian.py +151 -0
- preprocessors/ljspeech.py +197 -0
- preprocessors/ljspeech_vocoder.py +86 -0
- preprocessors/m4singer.py +138 -0
- preprocessors/metadata.py +138 -0
- preprocessors/nus48e.py +203 -0
- preprocessors/opencpop.py +73 -0
- preprocessors/opensinger.py +169 -0
- preprocessors/opera.py +186 -0
- preprocessors/pjs.py +135 -0
- preprocessors/popbutfy.py +153 -0
- preprocessors/popcs.py +118 -0
- preprocessors/processor.py +100 -0
- preprocessors/svcc.py +85 -0
- preprocessors/svcceval.py +80 -0
- preprocessors/vctk.py +163 -0
- preprocessors/vctkfewsinger.py +175 -0
- preprocessors/vctksample.py +108 -0
- preprocessors/vocalist.py +137 -0
- pretrained/bigvgan/args.json +235 -0
- pretrained/contentvec/README.md +5 -0
optimizer/__init__.py
ADDED
File without changes
optimizer/optimizers.py
ADDED
@@ -0,0 +1,780 @@
# This module is modified from https://github.com/Plachtaa/VALL-E-X/blob/3faaf8ccadb154d63b38070caf518ce9309ea0f4/modules/optim.py#L836

import logging
import contextlib
import torch
from torch import Tensor
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Optimizer
from typing import List, Tuple
from collections import defaultdict


class NoamLR(_LRScheduler):
    """
    Implements the Noam learning rate schedule. This corresponds to increasing the learning rate
    linearly for the first ``num_warmup`` training steps, and decreasing it thereafter proportionally
    to the inverse square root of the step number, scaled by the inverse square root of the
    dimensionality of the model. Time will tell if this is just madness or it's actually important.

    Parameters
    ----------
    num_warmup: ``int``, required.
        The number of steps to linearly increase the learning rate.
    """

    def __init__(self, optimizer, num_warmup):
        self.num_warmup = num_warmup
        self.base_lr = optimizer.param_groups[0]["lr"]
        super().__init__(optimizer)

    def get_lr(self):
        last_epoch = max(1, self.last_epoch)
        scale = min(last_epoch ** (-0.5), last_epoch * self.num_warmup ** (-1.5))
        return [scale * self.base_lr]


class Eve(Optimizer):
    """
    Implements the Eve algorithm. This is a modified version of AdamW with a special
    way of setting the weight-decay / shrinkage-factor, which is designed to make the
    rms of the parameters approach a particular target_rms (default: 0.1). This is
    for use with networks with 'scaled' versions of modules (see scaling.py), which
    will be close to invariant to the absolute scale on the parameter matrix.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
    Eve is unpublished so far.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 3e-4;
            this value means that the weight would decay significantly after
            about 3k minibatches. It is not multiplied by the learning rate, but
            is conditional on the RMS-value of the parameter being > target_rms.
        target_rms (float, optional): target root-mean-square value of
            parameters; if they fall below this we will stop applying weight decay.


    .. _Adam\\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.98),
        eps=1e-8,
        weight_decay=1e-3,
        target_rms=0.1,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0 <= weight_decay <= 0.1:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0 < target_rms <= 10.0:
            raise ValueError("Invalid target_rms value: {}".format(target_rms))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            target_rms=target_rms,
        )
        super(Eve, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Eve, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                # Perform optimization step
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError("AdamW does not support sparse gradients")

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                beta1, beta2 = group["betas"]

                state["step"] += 1
                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
                    group["eps"]
                )

                step_size = group["lr"] / bias_correction1
                target_rms = group["target_rms"]
                weight_decay = group["weight_decay"]

                if p.numel() > 1:
                    # avoid applying this weight-decay on "scaling factors"
                    # (which are scalar).
                    is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
                    p.mul_(1 - (weight_decay * is_above_target_rms))

                p.addcdiv_(exp_avg, denom, value=-step_size)

                # if random.random() < 0.0005:
                #     step = (exp_avg / denom) * step_size
                #     logging.info(
                #         f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
                #     )

        return loss


class BatchedOptimizer(Optimizer):
    """
    This class adds to class Optimizer the capability to optimize parameters in batches:
    it will stack the parameters and their grads for you so the optimizer can work
    on tensors with an extra leading dimension. This is intended for speed with GPUs,
    as it reduces the number of kernels launched in the optimizer.

    Args:
      params:
    """

    def __init__(self, params, defaults):
        super(BatchedOptimizer, self).__init__(params, defaults)

    @contextlib.contextmanager
    def batched_params(self, param_group, group_params_names):
        """
        This function returns (technically, yields) a list of
        tuples (p, state), where
        p is a `fake` parameter that is stacked (over axis 0) from real parameters
        that share the same shape, and its gradient is also stacked;
        `state` is the state corresponding to this batch of parameters
        (it will be physically located in the "state" for one of the real
        parameters, the last one that has any particular shape and dtype).

        This function is decorated as a context manager so that it can
        write parameters back to their "real" locations.

        The idea is, instead of doing:
        <code>
          for p in group["params"]:
             state = self.state[p]
             ...
        </code>
        you can do:
        <code>
          with self.batched_params(group["params"]) as batches:
             for p, state, p_names in batches:
                 ...
        </code>

        Args:
          group: a parameter group, which is a list of parameters; should be
                one of self.param_groups.
          group_params_names: name for each parameter in group,
                which is List[str].
        """
        batches = defaultdict(
            list
        )  # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
        batches_names = defaultdict(
            list
        )  # `batches_names` maps from tuple (dtype_as_str,*shape) to list of str

        assert len(param_group) == len(group_params_names)
        for p, named_p in zip(param_group, group_params_names):
            key = (str(p.dtype), *p.shape)
            batches[key].append(p)
            batches_names[key].append(named_p)

        batches_names_keys = list(batches_names.keys())
        sorted_idx = sorted(
            range(len(batches_names)), key=lambda i: batches_names_keys[i]
        )
        batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
        batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]

        stacked_params_dict = dict()

        # turn batches into a list, in deterministic order.
        # tuples will contain tuples of (stacked_param, state, stacked_params_names),
        # one for each batch in `batches`.
        tuples = []

        for batch, batch_names in zip(batches, batches_names):
            p = batch[0]
            # we arbitrarily store the state in the
            # state corresponding to the 1st parameter in the
            # group. class Optimizer will take care of saving/loading state.
            state = self.state[p]
            p_stacked = torch.stack(batch)
            grad = torch.stack(
                [torch.zeros_like(p) if p.grad is None else p.grad for p in batch]
            )
            p_stacked.grad = grad
            stacked_params_dict[key] = p_stacked
            tuples.append((p_stacked, state, batch_names))

        yield tuples

        for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
            for i, p in enumerate(batch):
                p.copy_(stacked_params[i])


class ScaledAdam(BatchedOptimizer):
    """
    Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
    proportional to the norm of that parameter; and also learn the scale of the parameter,
    in log space, subject to upper and lower limits (as if we had factored each parameter as
    param = underlying_param * log_scale.exp())


    Args:
         params:  The parameters or param_groups to optimize (like other Optimizer subclasses)
             lr:  The learning rate. We will typically use a learning rate schedule that starts
                  at 0.03 and decreases over time, i.e. much higher than other common
                  optimizers.
     clipping_scale: (e.g. 2.0)
                  A scale for gradient-clipping: if specified, the normalized gradients
                  over the whole model will be clipped to have 2-norm equal to
                  `clipping_scale` times the median 2-norm over the most recent period
                  of `clipping_update_period` minibatches. By "normalized gradients",
                  we mean after multiplying by the rms parameter value for this tensor
                  [for non-scalars]; this is appropriate because our update is scaled
                  by this quantity.
          betas:  beta1, beta2 are momentum constants for regular momentum, and moving sum-sq grad.
                  Must satisfy 0 < beta1 <= beta2 < 1.
     scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
                  scale of each parameter tensor and scalar parameters of the model.
                  If each parameter were decomposed
                  as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
                  would be the scaling factor on the learning rate of p_scale.
            eps:  A general-purpose epsilon to prevent division by zero
   param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
                  learning the scale on the parameters (we'll constrain the rms of each non-scalar
                  parameter tensor to be >= this value)
   param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
                  learning the scale on the parameters (we'll constrain the rms of each non-scalar
                  parameter tensor to be <= this value)
      scalar_max: Maximum absolute value for scalar parameters (applicable if your
                  model has any parameters with numel() == 1).
   size_update_period: The periodicity, in steps, with which we update the size (scale)
                  of the parameter tensor. This is provided to save a little time
                  in the update.
   clipping_update_period: if clipping_scale is specified, this is the period
    """

    def __init__(
        self,
        params,
        lr=3e-02,
        clipping_scale=None,
        betas=(0.9, 0.98),
        scalar_lr_scale=0.1,
        eps=1.0e-08,
        param_min_rms=1.0e-05,
        param_max_rms=3.0,
        scalar_max=10.0,
        size_update_period=4,
        clipping_update_period=100,
        parameters_names=None,
        show_dominant_parameters=True,
    ):

        assert parameters_names is not None, (
            "Please prepare parameters_names,"
            "which is a List[List[str]]. Each List[str] is for a group"
            "and each str is for a parameter"
        )
        defaults = dict(
            lr=lr,
            clipping_scale=clipping_scale,
            betas=betas,
            scalar_lr_scale=scalar_lr_scale,
            eps=eps,
            param_min_rms=param_min_rms,
            param_max_rms=param_max_rms,
            scalar_max=scalar_max,
            size_update_period=size_update_period,
            clipping_update_period=clipping_update_period,
        )

        super(ScaledAdam, self).__init__(params, defaults)
        assert len(self.param_groups) == len(parameters_names)
        self.parameters_names = parameters_names
        self.show_dominant_parameters = show_dominant_parameters

    def __setstate__(self, state):
        super(ScaledAdam, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        batch = True

        for group, group_params_names in zip(self.param_groups, self.parameters_names):

            with self.batched_params(group["params"], group_params_names) as batches:

                # batches is list of pairs (stacked_param, state). stacked_param is like
                # a regular parameter, and will have a .grad, but the 1st dim corresponds to
                # a stacking dim, it is not a real dim.

                if (
                    len(batches[0][1]) == 0
                ):
                    clipping_scale = 1
                else:
                    clipping_scale = self._get_clipping_scale(group, batches)

                for p, state, _ in batches:
                    # Perform optimization step.
                    # grad is not going to be None, we handled that when creating the batches.
                    grad = p.grad
                    if grad.is_sparse:
                        raise RuntimeError(
                            "ScaledAdam optimizer does not support sparse gradients"
                        )
                    # State initialization
                    if len(state) == 0:
                        self._init_state(group, p, state)

                    self._step_one_batch(group, p, state, clipping_scale)

        return loss

    def _init_state(self, group: dict, p: Tensor, state: dict):
        """
        Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
        is actually the batch dimension, corresponding to batched-together
        parameters of a given shape.


        Args:
           group:   Dict to look up configuration values.
               p: The parameter that we are initializing the state for
           state: Dict from string to whatever state we are initializing
        """
        size_update_period = group["size_update_period"]

        state["step"] = 0

        kwargs = {"device": p.device, "dtype": p.dtype}

        # 'delta' implements conventional momentum. There are
        # several different kinds of update going on, so rather than
        # compute "exp_avg" like in Adam, we store and decay a
        # parameter-change "delta", which combines all forms of
        # update. this is equivalent to how it's done in Adam,
        # except for the first few steps.
        state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)

        batch_size = p.shape[0]
        numel = p.numel() // batch_size
        numel = p.numel()

        if numel > 1:
            # "param_rms" just periodically records the scalar root-mean-square value of
            # the parameter tensor.
            # it has a shape like (batch_size, 1, 1, 1, 1)
            param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
            state["param_rms"] = param_rms

            state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
            state["scale_grads"] = torch.zeros(
                size_update_period, *param_rms.shape, **kwargs
            )

        # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
        state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)

    def _get_clipping_scale(
        self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
    ) -> float:
        """
        Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
        by this amount before applying the rest of the update.

        Args:
           group: the parameter group, an item in self.param_groups
           tuples: a list of tuples of (param, state, param_names)
                where param is a batched set of parameters,
                with a .grad (1st dim is batch dim)
                and state is the state-dict where optimization parameters are kept.
                param_names is a List[str] where each str is the name for a parameter
                in the batched set of parameters "param".
        """
        assert len(tuples) >= 1
        clipping_scale = group["clipping_scale"]
        (first_p, first_state, _) = tuples[0]
        step = first_state["step"]
        if clipping_scale is None or step == 0:
            # no clipping. return early on step == 0 because the other
            # parameters' state won't have been initialized yet.
            return 1.0
        clipping_update_period = group["clipping_update_period"]

        tot_sumsq = torch.tensor(0.0, device=first_p.device)
        for (p, state, param_names) in tuples:
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError(
                    "ScaledAdam optimizer does not support sparse gradients"
                )
            if p.numel() == p.shape[0]:  # a batch of scalars
                tot_sumsq += (grad**2).sum()  # sum() to change shape [1] to []
            else:
                tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()

        tot_norm = tot_sumsq.sqrt()
        if "model_norms" not in first_state:
            first_state["model_norms"] = torch.zeros(
                clipping_update_period, device=p.device
            )
        first_state["model_norms"][step % clipping_update_period] = tot_norm

        if step % clipping_update_period == 0:
            # Print some stats.
            # We don't reach here if step == 0 because we would have returned
            # above.
            sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
            quartiles = []
            for n in range(0, 5):
                index = min(
                    clipping_update_period - 1,
                    (clipping_update_period // 4) * n,
                )
                quartiles.append(sorted_norms[index].item())

            median = quartiles[2]
            threshold = clipping_scale * median
            first_state["model_norm_threshold"] = threshold
            percent_clipped = (
                first_state["num_clipped"] * 100.0 / clipping_update_period
                if "num_clipped" in first_state
                else 0.0
            )
            first_state["num_clipped"] = 0
            quartiles = " ".join(["%.3e" % x for x in quartiles])
            logging.info(
                f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
                f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
            )

        if step < clipping_update_period:
            return 1.0  # We have not yet estimated a norm to clip to.
        else:
            try:
                model_norm_threshold = first_state["model_norm_threshold"]
            except KeyError:
                logging.info(
                    "Warning: model_norm_threshold not in state: possibly "
                    "you changed config when restarting, adding clipping_scale option?"
                )
                return 1.0
            ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
            if ans < 1.0:
                first_state["num_clipped"] += 1
            if ans < 0.1:
                logging.warn(
                    f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
                )
                if self.show_dominant_parameters:
                    assert p.shape[0] == len(param_names)
                    self._show_gradient_dominating_parameter(tuples, tot_sumsq)
            return ans

    def _show_gradient_dominating_parameter(
        self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor
    ):
        """
        Show information about the parameter which dominates tot_sumsq.

        Args:
            tuples: a list of tuples of (param, state, param_names)
                where param is a batched set of parameters,
                with a .grad (1st dim is batch dim)
                and state is the state-dict where optimization parameters are kept.
                param_names is a List[str] where each str is the name for a parameter
                in the batched set of parameters "param".
            tot_sumsq: sumsq of all parameters. Though it could be calculated
                from tuples, we still pass it to save some time.
        """
        all_sumsq_orig = {}
        for (p, state, batch_param_names) in tuples:
            # p is a stacked batch of parameters.
            batch_grad = p.grad
            if p.numel() == p.shape[0]:  # a batch of scalars
                batch_sumsq_orig = batch_grad**2
                # Dummy values used by following `zip` statement.
                batch_rms_orig = torch.ones(p.shape[0])
            else:
                batch_rms_orig = state["param_rms"]
                batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(
                    dim=list(range(1, batch_grad.ndim))
                )

            for name, sumsq_orig, rms, grad in zip(
                batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
            ):

                proportion_orig = sumsq_orig / tot_sumsq
                all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)

        assert torch.isclose(
            sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
            torch.tensor(1.0),
        )
        sorted_by_proportion = {
            k: v
            for k, v in sorted(
                all_sumsq_orig.items(),
                key=lambda item: item[1][0],
                reverse=True,
            )
        }
        dominant_param_name = next(iter(sorted_by_proportion))
        (
            dominant_proportion,
            dominant_sumsq,
            dominant_rms,
            dominant_grad,
        ) = sorted_by_proportion[dominant_param_name]
        logging.info(
            f"Parameter Dominanting tot_sumsq {dominant_param_name}"
            f" with proportion {dominant_proportion:.2f},"
            f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
            f"={dominant_sumsq:.3e},"
            f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
            f" orig_rms_sq={(dominant_rms**2).item():.3e}"
        )

    def _step_one_batch(
        self, group: dict, p: Tensor, state: dict, clipping_scale: float
    ):
        """
        Do the step for one parameter, which is actually going to be a batch of
        `real` parameters, with dim 0 as the batch dim.
        Args:
                  group:  dict to look up configuration values
                      p:  parameter to update (actually multiple parameters stacked together
                          as a batch)
                  state:  state-dict for p, to look up the optimizer state
        """
        lr = group["lr"]
        size_update_period = group["size_update_period"]
        beta1 = group["betas"][0]

        grad = p.grad
        if clipping_scale != 1.0:
            grad = grad * clipping_scale
        step = state["step"]
        delta = state["delta"]

        delta.mul_(beta1)
        batch_size = p.shape[0]
        numel = p.numel() // batch_size
        if numel > 1:
            # Update the size/scale of p, and set param_rms
            scale_grads = state["scale_grads"]
            scale_grads[step % size_update_period] = (p * grad).sum(
                dim=list(range(1, p.ndim)), keepdim=True
            )
            if step % size_update_period == size_update_period - 1:
                param_rms = state["param_rms"]  # shape: (batch_size, 1, 1, ..)
                param_rms.copy_(
                    (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
                )
                if step > 0:
                    # self._size_update() learns the overall scale on the
                    # parameter, by shrinking or expanding it.
                    self._size_update(group, scale_grads, p, state)

        if numel == 1:
            # For parameters with 1 element we just use regular Adam.
            # Updates delta.
            self._step_scalar(group, p, state)
        else:
            self._step(group, p, state)

        state["step"] = step + 1

    def _size_update(
        self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
    ) -> None:
        """
        Called only where p.numel() > 1, this updates the scale of the parameter.
        If we imagine: p = underlying_param * scale.exp(), and we are doing
        gradient descent on underlying param and on scale, this function does the update
        on `scale`.

        Args:
             group: dict to look up configuration values
       scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
                    grads w.r.t. the scales.
                 p: The parameter to update
             state: The state-dict of p
        """

        param_rms = state["param_rms"]
        beta1, beta2 = group["betas"]
        size_lr = group["lr"] * group["scalar_lr_scale"]
        param_min_rms = group["param_min_rms"]
        param_max_rms = group["param_max_rms"]
        eps = group["eps"]
        step = state["step"]
        batch_size = p.shape[0]

        size_update_period = scale_grads.shape[0]
        # correct beta2 for the size update period: we will have
        # faster decay at this level.
        beta2_corr = beta2**size_update_period

        scale_exp_avg_sq = state["scale_exp_avg_sq"]  # shape: (batch_size, 1, 1, ..)
        scale_exp_avg_sq.mul_(beta2_corr).add_(
            (scale_grads**2).mean(dim=0),  # mean over dim `size_update_period`
            alpha=1 - beta2_corr,
        )  # shape is (batch_size, 1, 1, ...)

        # The 1st time we reach here is when size_step == 1.
        size_step = (step + 1) // size_update_period
        bias_correction2 = 1 - beta2_corr**size_step
        # we don't bother with bias_correction1; this will help prevent divergence
        # at the start of training.

        denom = scale_exp_avg_sq.sqrt() + eps

        scale_step = (
            -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
        )

        is_too_small = param_rms < param_min_rms
        is_too_large = param_rms > param_max_rms

        # when the param gets too small, just don't shrink it any further.
        scale_step.masked_fill_(is_too_small, 0.0)
        # when it gets too large, stop it from getting any larger.
        scale_step.masked_fill_(is_too_large, -size_lr * size_update_period)
        delta = state["delta"]
        # the factor of (1-beta1) relates to momentum.
        delta.add_(p * scale_step, alpha=(1 - beta1))

    def _step(self, group: dict, p: Tensor, state: dict):
        """
        This function does the core update of self.step(), in the case where the members of
        the batch have more than 1 element.

        Args:
            group: A dict which will be used to look up configuration values
                p: The parameter to be updated
             grad: The grad of p
            state: The state-dict corresponding to parameter p

        This function modifies p.
        """
        grad = p.grad
        lr = group["lr"]
        beta1, beta2 = group["betas"]
        eps = group["eps"]
        param_min_rms = group["param_min_rms"]
        step = state["step"]

        exp_avg_sq = state["exp_avg_sq"]
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))

        this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
        bias_correction2 = 1 - beta2 ** (this_step + 1)
        if bias_correction2 < 0.99:
            # note: not in-place.
            exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)

        denom = exp_avg_sq.sqrt()
        denom += eps
        grad = grad / denom

        alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)

        delta = state["delta"]
        delta.add_(grad * alpha)
        p.add_(delta)

    def _step_scalar(self, group: dict, p: Tensor, state: dict):
        """
        A simplified form of the core update for scalar tensors, where we cannot get a good
        estimate of the parameter rms.
        """
        beta1, beta2 = group["betas"]
        scalar_max = group["scalar_max"]
        eps = group["eps"]
        lr = group["lr"] * group["scalar_lr_scale"]
        grad = p.grad

        exp_avg_sq = state["exp_avg_sq"]  # shape: (batch_size,)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        # bias_correction2 is like in Adam. Don't bother with bias_correction1;
        # slower update at the start will help stability anyway.
        bias_correction2 = 1 - beta2 ** (state["step"] + 1)
        denom = (exp_avg_sq / bias_correction2).sqrt() + eps

        delta = state["delta"]
        delta.add_(grad / denom, alpha=-lr * (1 - beta1))
        p.clamp_(min=-scalar_max, max=scalar_max)
        p.add_(delta)
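A minimal usage sketch for the optimizers added above (not part of this commit): ScaledAdam requires `parameters_names`, one List[str] per parameter group, and Eve can be paired with NoamLR for warmup. The toy linear model, the 0.03 learning rate, the clipping_scale of 2.0, and the warmup length below are illustrative assumptions, not values taken from this repository.

# Hypothetical usage sketch; the model and hyper-parameter values are assumptions.
import torch
from optimizer.optimizers import Eve, NoamLR, ScaledAdam

model = torch.nn.Linear(80, 256)

# ScaledAdam needs parameters_names: one List[str] per param group,
# aligned with the parameters it receives.
parameters_names = [[name for name, _ in model.named_parameters()]]
scaled_adam = ScaledAdam(
    model.parameters(),
    lr=3e-2,             # the docstring suggests a schedule starting near 0.03
    clipping_scale=2.0,  # optional: clip to 2x the recent median normalized grad norm
    parameters_names=parameters_names,
)

# Eve is the AdamW-like variant; NoamLR warms the learning rate up for num_warmup steps.
eve = Eve(model.parameters(), lr=1e-3)
scheduler = NoamLR(eve, num_warmup=4000)

x, y = torch.randn(4, 80), torch.randn(4, 256)
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
scaled_adam.step()       # one ScaledAdam update
scaled_adam.zero_grad()
# With Eve one would instead call eve.step(); eve.zero_grad(); scheduler.step() per iteration.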
preprocessors/__init__.py
ADDED
@@ -0,0 +1,189 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
For source datasets' standard samples
"""

from collections import defaultdict
import os
import json

SPEECH_DATASETS = ["vctk", "vctksample"]

GOLDEN_TEST_SAMPLES = defaultdict(list)
GOLDEN_TEST_SAMPLES["m4singer"] = [
    "Alto-1_美错_0014",
    "Bass-1_十年_0008",
    "Soprano-2_同桌的你_0018",
    "Tenor-5_爱笑的眼睛_0010",
]
GOLDEN_TEST_SAMPLES["svcc"] = [
    # IDF1
    "IDF1_10030",
    "IDF1_10120",
    "IDF1_10140",
    # IDM1
    "IDM1_10001",
    "IDM1_10030",
    "IDM1_10120",
    # CDF1
    "CDF1_10030",
    "CDF1_10120",
    "CDF1_10140",
    # CDM1
    "CDM1_10001",
    "CDM1_10030",
    "CDM1_10120",
]
GOLDEN_TEST_SAMPLES["svcceval"] = [
    # SF1
    "SF1_30001",
    "SF1_30002",
    "SF1_30003",
    # SM1
    "SM1_30001",
    "SM1_30002",
    "SM1_30003",
]
GOLDEN_TEST_SAMPLES["popbutfy"] = [
    "Female1#you_are_my_sunshine_Professional#0",
    "Female4#Someone_Like_You_Professional#10",
    "Male2#Lemon_Tree_Professional#12",
    "Male5#can_you_feel_the_love_tonight_Professional#20",
]
GOLDEN_TEST_SAMPLES["opensinger"] = [
    "Man_0_大鱼_10",
    "Man_21_丑八怪_14",
    "Woman_39_mojito_22",
    "Woman_40_易燃易爆炸_12",
]
GOLDEN_TEST_SAMPLES["nus48e"] = [
    "ADIZ_read#01#0000",
    "MCUR_sing#10#0000",
    "JLEE_read#08#0001",
    "SAMF_sing#18#0001",
]
GOLDEN_TEST_SAMPLES["popcs"] = [
    "明天会更好_0004",
    "欧若拉_0005",
    "虫儿飞_0006",
    "隐形的翅膀_0008",
]
GOLDEN_TEST_SAMPLES["kising"] = [
    "421_0040",
    "424_0013",
    "431_0026",
]
GOLDEN_TEST_SAMPLES["csd"] = [
    "en_004a_0001",
    "en_042b_0006",
    "kr_013a_0006",
    "kr_045b_0004",
]
GOLDEN_TEST_SAMPLES["opera"] = [
    "fem_01#neg_1#0000",
    "fem_12#pos_3#0003",
    "male_02#neg_1#0002",
    "male_11#pos_2#0001",
]
GOLDEN_TEST_SAMPLES["lijian"] = [
    "058矜持_0000",
    "079绒花_0000",
    "120遥远的天空底下_0000",
]
GOLDEN_TEST_SAMPLES["cdmusiceval"] = ["陶喆_普通朋友", "蔡琴_给电影人的情书"]

GOLDEN_TRAIN_SAMPLES = defaultdict(list)


def get_golden_samples_indexes(
    dataset_name,
    dataset_dir=None,
    cfg=None,
    split=None,
    min_samples=5,
):
    """
    # Get Standard samples' indexes
    """
    if dataset_dir is None:
        assert cfg is not None
        dataset_dir = os.path.join(
            cfg.OUTPUT_PATH,
            "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
            dataset_name,
        )

    assert split is not None
    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
    with open(utt_file, "r", encoding="utf-8") as f:
        samples = json.load(f)

    if "train" in split:
        golden_samples = GOLDEN_TRAIN_SAMPLES[dataset_name]
    if "test" in split:
        golden_samples = GOLDEN_TEST_SAMPLES[dataset_name]

    res = []
    for idx, utt in enumerate(samples):
        if utt["Uid"] in golden_samples:
            res.append(idx)

        if dataset_name == "cdmusiceval":
            if "_".join(utt["Uid"].split("_")[:2]) in golden_samples:
                res.append(idx)

    if len(res) == 0:
        res = [i for i in range(min_samples)]

    return res


def get_specific_singer_indexes(dataset_dir, singer_name, split):
    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
    with open(utt_file, "r", encoding="utf-8") as f:
        samples = json.load(f)

    res = []
    for idx, utt in enumerate(samples):
        if utt["Singer"] == singer_name:
            res.append(idx)

    assert len(res) != 0
    return res


def get_uids_and_wav_paths(
    cfg, dataset, dataset_type="train", only_specific_singer=None, return_singers=False
):
    dataset_dir = os.path.join(
        cfg.OUTPUT_PATH, "preprocess/{}_version".format(cfg.PREPROCESS_VERSION), dataset
    )
    dataset_file = os.path.join(
        dataset_dir, "{}.json".format(dataset_type.split("_")[-1])
    )
    with open(dataset_file, "r") as f:
        utterances = json.load(f)

    indexes = range(len(utterances))
    if "golden" in dataset_type:
        # golden_train or golden_test
        indexes = get_golden_samples_indexes(
            dataset, dataset_dir, split=dataset_type.split("_")[-1]
        )
    if only_specific_singer is not None:
        indexes = get_specific_singer_indexes(
            dataset_dir, only_specific_singer, dataset_type
        )

    uids = [utterances[i]["Uid"] for i in indexes]
    wav_paths = [utterances[i]["Path"] for i in indexes]
    singers = [utterances[i]["Singer"] for i in indexes]

    if not return_singers:
        return uids, wav_paths
    else:
        return uids, wav_paths, singers
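The helpers above index into per-dataset train.json / test.json metadata files. A sketch of one utterance entry as these functions expect it is shown below; the field names follow the lookups in this file and in the preprocessors later in this commit, while the wav path value is a hypothetical placeholder.

# Hypothetical example of one entry in <dataset_dir>/train.json or test.json.
# "Uid", "Path" and "Singer" are the keys read by get_golden_samples_indexes,
# get_specific_singer_indexes and get_uids_and_wav_paths above; "Dataset" and
# "Duration" are read by preprocessors/bigdata.py below. The path is made up.
example_utterance = {
    "Dataset": "m4singer",
    "Singer": "Alto-1",
    "Uid": "Alto-1_美错_0014",  # one of the golden test samples listed above
    "Path": "/path/to/m4singer/Alto-1_美错_0014.wav",
    "Duration": 3.5,  # seconds
}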
preprocessors/bigdata.py
ADDED
@@ -0,0 +1,145 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import os
from collections import defaultdict
from tqdm import tqdm


def get_uids_and_wav_paths(cfg, dataset, dataset_type):
    assert dataset == "bigdata"
    dataset_dir = os.path.join(
        cfg.OUTPUT_PATH,
        "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
        "bigdata/{}".format(cfg.BIGDATA_VERSION),
    )
    dataset_file = os.path.join(
        dataset_dir, "{}.json".format(dataset_type.split("_")[-1])
    )
    with open(dataset_file, "r") as f:
        utterances = json.load(f)

    # Uids
    uids = [u["Uid"] for u in utterances]

    # Wav paths
    wav_paths = [u["Path"] for u in utterances]

    return uids, wav_paths


def take_duration(utt):
    return utt["Duration"]


def main(output_path, cfg):
    datasets = cfg.dataset

    print("-" * 10)
    print("Preparing samples for bigdata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    datasets.sort()
    bigdata_version = "_".join(datasets)

    save_dir = os.path.join(output_path, bigdata_version)
    os.makedirs(save_dir, exist_ok=True)

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
    utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)
    utt2singer = open(utt2singer_file, "a+")
    # We select songs of standard samples as test songs
    train = []
    test = []

    train_total_duration = 0
    test_total_duration = 0

    # Singer unique names
    singer_names = set()

    for dataset in datasets:
        dataset_path = os.path.join(output_path, dataset)
        train_json = os.path.join(dataset_path, "train.json")
        test_json = os.path.join(dataset_path, "test.json")

        with open(train_json, "r", encoding="utf-8") as f:
            train_utterances = json.load(f)

        with open(test_json, "r", encoding="utf-8") as f:
            test_utterances = json.load(f)

        for utt in tqdm(train_utterances):
            train.append(utt)
            train_total_duration += utt["Duration"]
            singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"]))
            utt2singer.write(
                "{}_{}\t{}_{}\n".format(
                    utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"]
                )
            )

        for utt in test_utterances:
            test.append(utt)
            test_total_duration += utt["Duration"]
            singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"]))
            utt2singer.write(
                "{}_{}\t{}_{}\n".format(
                    utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"]
                )
            )

    utt2singer.close()

    train.sort(key=take_duration)
    test.sort(key=take_duration)
    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Singer Look Up Table
    singer_names = list(singer_names)
    singer_names.sort()
    singer_lut = {name: i for i, name in enumerate(singer_names)}
    print("#Singers: {}\n".format(len(singer_lut)))

    # Save
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    # Save meta info
    meta_info = {
        "datasets": datasets,
        "train": {"size": len(train), "hours": round(train_total_duration / 3600, 4)},
        "test": {"size": len(test), "hours": round(test_total_duration / 3600, 4)},
        "singers": {"size": len(singer_lut)},
    }
    singer2mins = defaultdict(float)
    for utt in train:
        dataset, singer, duration = utt["Dataset"], utt["Singer"], utt["Duration"]
        singer2mins["{}_{}".format(dataset, singer)] += duration / 60
    singer2mins = sorted(singer2mins.items(), key=lambda x: x[1], reverse=True)
    singer2mins = dict(
        zip([i[0] for i in singer2mins], [round(i[1], 2) for i in singer2mins])
    )
    meta_info["singers"]["training_minutes"] = singer2mins

    with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
        json.dump(meta_info, f, indent=4, ensure_ascii=False)

    for singer, min in singer2mins.items():
        print("Singer {}: {} mins".format(singer, min))
    print("-" * 10, "\n")
preprocessors/cdmusiceval.py
ADDED
@@ -0,0 +1,174 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from glob import glob
import os
import json
import torchaudio
from tqdm import tqdm
from collections import defaultdict

from utils.util import has_existed, remove_and_create
from utils.audio_slicer import split_utterances_from_audio


def split_to_utterances(input_dir, output_dir):
    print("Splitting to utterances for {}...".format(input_dir))

    files_list = glob("*", root_dir=input_dir)
    files_list.sort()
    for wav_file in tqdm(files_list):
        # # Load waveform
        # waveform, fs = torchaudio.load(os.path.join(input_dir, wav_file))

        # Singer name, Song name
        song_name, singer_name = wav_file.split("_")[2].split("-")
        save_dir = os.path.join(output_dir, singer_name, song_name)

        split_utterances_from_audio(
            os.path.join(input_dir, wav_file), save_dir, max_duration_of_utterance=10
        )

        # # Split
        # slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000, min_interval=1000)
        # chunks = slicer.slice(waveform)

        # for i, chunk in enumerate(chunks):
        #     save_dir = os.path.join(output_dir, singer_name, song_name)
        #     os.makedirs(save_dir, exist_ok=True)

        #     output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
        #     save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")
    remove_and_create(utterance_dir)
    split_to_utterances(os.path.join(dataset_path, "vocal"), utterance_dir)


def statistics(utterance_dir):
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(utterance_dir + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        song_infos = glob(singer_info + "/*")

        for song_info in song_infos:
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            utts = glob(song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "Statistics: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing samples for CD Music Eval...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Spliting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "cdmusiceval")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    utt_path = os.path.join(dataset_path, "utterances")
    singers2songs, unique_singers = statistics(utt_path)

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singers2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "cdmusiceval",
                    "Singer": singer,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
                res["Path"] = os.path.join(utt_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if duration <= 1e-8:
                    continue

                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/coco.py
ADDED
@@ -0,0 +1,100 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    return ["007Di Da Di"]


def coco_statistics(data_dir):
    song2utts = defaultdict(list)

    song_infos = glob(data_dir + "/*")

    for song in song_infos:
        song_name = song.split("/")[-1]
        utts = glob(song + "/*.wav")
        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            song2utts[song_name].append(uid)

    print("Coco: {} songs".format(len(song_infos)))
    return song2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing datasets for Coco...\n")

    save_dir = os.path.join(output_path, "coco")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    song2utts = coco_statistics(dataset_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for song_name, uids in tqdm(song2utts.items()):
        for chosen_uid in uids:
            res = {
                "Dataset": "coco",
                "Singer": "coco",
                "Song": song_name,
                "Uid": "{}_{}".format(song_name, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(song_name, chosen_uid)
            res["Path"] = os.path.join(dataset_path, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if song_name in test_songs:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/cocoeval.py
ADDED
@@ -0,0 +1,99 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from utils.audio_slicer import split_utterances_from_audio
from preprocessors import GOLDEN_TEST_SAMPLES


def _split_utts():
    raw_dir = "/mnt/chongqinggeminiceph1fs/geminicephfs/wx-mm-spr-xxxx/xueyaozhang/dataset/李玟/cocoeval/raw"
    output_root = "/mnt/chongqinggeminiceph1fs/geminicephfs/wx-mm-spr-xxxx/xueyaozhang/dataset/李玟/cocoeval/utterances"

    if os.path.exists(output_root):
        os.system("rm -rf {}".format(output_root))

    vocal_files = glob(os.path.join(raw_dir, "*/vocal.wav"))
    for vocal_f in tqdm(vocal_files):
        song_name = vocal_f.split("/")[-2]

        output_dir = os.path.join(output_root, song_name)
        os.makedirs(output_dir, exist_ok=True)

        split_utterances_from_audio(vocal_f, output_dir, min_interval=300)


def cocoeval_statistics(data_dir):
    song2utts = defaultdict(list)

    song_infos = glob(data_dir + "/*")

    for song in song_infos:
        song_name = song.split("/")[-1]
        utts = glob(song + "/*.wav")
        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            song2utts[song_name].append(uid)

    print("Cocoeval: {} songs".format(len(song_infos)))
    return song2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing datasets for Cocoeval...\n")

    save_dir = os.path.join(output_path, "cocoeval")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    song2utts = cocoeval_statistics(dataset_path)

    train, test = [], []
    train_index_count, test_index_count = 0, 0
    train_total_duration, test_total_duration = 0.0, 0.0

    for song_name, uids in tqdm(song2utts.items()):
        for chosen_uid in uids:
            res = {
                "Dataset": "cocoeval",
                "Singer": "TBD",
                "Song": song_name,
                "Uid": "{}_{}".format(song_name, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(song_name, chosen_uid)
            res["Path"] = os.path.join(dataset_path, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            res["index"] = test_index_count
            test_total_duration += duration
            test.append(res)
            test_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/csd.py
ADDED
@@ -0,0 +1,202 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import glob
from tqdm import tqdm
import torchaudio
import pandas as pd
from glob import glob
from collections import defaultdict

from utils.io import save_audio
from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def save_utterance(output_file, waveform, fs, start, end, overlap=0.1):
    """
    waveform: [#channel, audio_len]
    start, end, overlap: seconds
    """
    start = int((start - overlap) * fs)
    end = int((end + overlap) * fs)
    utterance = waveform[:, start:end]
    save_audio(output_file, utterance, fs)


def split_to_utterances(language_dir, output_dir):
    print("Splitting to utterances for {}...".format(language_dir))
    wav_dir = os.path.join(language_dir, "wav")
    phoneme_dir = os.path.join(language_dir, "txt")
    annot_dir = os.path.join(language_dir, "csv")

    pitches = set()
    for wav_file in tqdm(glob("{}/*.wav".format(wav_dir))):
        # Load waveform
        song_name = wav_file.split("/")[-1].split(".")[0]
        waveform, fs = torchaudio.load(wav_file)

        # Load utterances
        phoneme_file = os.path.join(phoneme_dir, "{}.txt".format(song_name))
        with open(phoneme_file, "r") as f:
            lines = f.readlines()
            utterances = [l.strip().split() for l in lines]
            utterances = [utt for utt in utterances if len(utt) > 0]

        # Load annotation
        annot_file = os.path.join(annot_dir, "{}.csv".format(song_name))
        annot_df = pd.read_csv(annot_file)
        pitches = pitches.union(set(annot_df["pitch"]))
        starts = annot_df["start"].tolist()
        ends = annot_df["end"].tolist()
        syllables = annot_df["syllable"].tolist()

        # Split
        curr = 0
        for i, phones in enumerate(utterances):
            sz = len(phones)
            assert phones[0] == syllables[curr]
            assert phones[-1] == syllables[curr + sz - 1]

            s = starts[curr]
            e = ends[curr + sz - 1]
            curr += sz

            save_dir = os.path.join(output_dir, song_name)
            os.makedirs(save_dir, exist_ok=True)

            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_utterance(output_file, waveform, fs, start=s, end=e)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")

    for lang in ["english", "korean"]:
        split_to_utterances(os.path.join(dataset_path, lang), utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["csd"]
    # every item is a tuple (language, song)
    golden_songs = [s.split("_")[:2] for s in golden_samples]
    # language_song, eg: en_001a
    return golden_songs


def csd_statistics(data_dir):
    languages = []
    songs = []
    languages2songs = defaultdict(lambda: defaultdict(list))

    folder_infos = glob(data_dir + "/*")

    for folder_info in folder_infos:
        folder_info_split = folder_info.split("/")[-1]

        language = folder_info_split[:2]
        song = folder_info_split[2:]

        languages.append(language)
        songs.append(song)

        utts = glob(folder_info + "/*")

        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            languages2songs[language][song].append(uid)

    unique_languages = list(set(languages))
    unique_songs = list(set(songs))
    unique_languages.sort()
    unique_songs.sort()

    print(
        "csd: {} languages, {} utterances ({} unique songs)".format(
            len(unique_languages), len(songs), len(unique_songs)
        )
    )
    print("Languages: \n{}".format("\t".join(unique_languages)))
    return languages2songs


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for csd...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "csd")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    csd_path = os.path.join(dataset_path, "utterances")

    language2songs = csd_statistics(csd_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for language, songs in tqdm(language2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "csd",
                    "Singer": "Female1_{}".format(language),
                    "Uid": "{}_{}_{}".format(language, chosen_song, chosen_uid),
                }
                res["Path"] = "{}{}/{}.wav".format(language, chosen_song, chosen_uid)
                res["Path"] = os.path.join(csd_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if [language, chosen_song] in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/custom.py
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from glob import glob
import os
import json
import torchaudio
from tqdm import tqdm
from collections import defaultdict

from utils.util import has_existed


def statistics(utterance_dir):
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(utterance_dir + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        song_infos = glob(singer_info + "/*")

        for song_info in song_infos:
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            utts = glob(song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "Statistics: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers


def main(output_path, dataset_path, dataset_name):
    print("-" * 10)
    print("Preparing samples for {}...\n".format(dataset_name))

    save_dir = os.path.join(output_path, dataset_name)
    os.makedirs(save_dir, exist_ok=True)

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    singers2songs, unique_singers = statistics(dataset_path)

    # We select songs of standard samples as test songs
    train = []
    test = []
    test_songs = set()

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in singers2songs.items():
        song_names = list(songs.keys())

        print("Singer {}...".format(singer))
        for chosen_song in tqdm(song_names):
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": dataset_name,
                    "Singer": singer,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
                res["Path"] = os.path.join(dataset_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                # Remove the utterance whose duration is shorter than 0.01s
                if duration <= 1e-2:
                    continue

                # Place into train or test
                if "{}_{}".format(singer, chosen_song) not in test_songs:
                    test_songs.add("{}_{}".format(singer, chosen_song))

                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
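The generic custom.py preprocessor above assumes an on-disk layout of <dataset_path>/<singer>/<song>/<uid>.wav. A minimal hedged sketch of a direct call (all names below are illustrative placeholders, not defined by this commit):

# Hedged usage sketch for the custom preprocessor; paths and name are assumptions.
from preprocessors import custom

custom.main(
    output_path="processed_data",            # output metadata directory
    dataset_path="/path/to/singing_corpus",  # singer/song/uid.wav tree
    dataset_name="my_corpus",                # used as the output subfolder and "Dataset" field
)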
preprocessors/kising.py
ADDED
@@ -0,0 +1,116 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_folders():
    golden_samples = GOLDEN_TEST_SAMPLES["kising"]
    # every item is a string
    golden_folders = [s.split("_")[:1] for s in golden_samples]
    # folder, eg: 422
    return golden_folders


def KiSing_statistics(data_dir):
    folders = []
    folders2utts = defaultdict(list)

    folder_infos = glob(data_dir + "/*")

    for folder_info in folder_infos:
        folder = folder_info.split("/")[-1]

        folders.append(folder)

        utts = glob(folder_info + "/*.wav")

        for utt in utts:
            uid = utt.split("/")[-1].split(".")[0]
            folders2utts[folder].append(uid)

    unique_folders = list(set(folders))
    unique_folders.sort()

    print("KiSing: {} unique songs".format(len(unique_folders)))
    return folders2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for KiSing...\n")

    save_dir = os.path.join(output_path, "kising")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    KiSing_dir = dataset_path

    folders2utts = KiSing_statistics(KiSing_dir)
    test_folders = get_test_folders()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    folder_names = list(folders2utts.keys())

    for chosen_folder in folder_names:
        for chosen_uid in folders2utts[chosen_folder]:
            res = {
                "Dataset": "kising",
                "Singer": "female1",
                "Uid": "{}_{}".format(chosen_folder, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(chosen_folder, chosen_uid)
            res["Path"] = os.path.join(KiSing_dir, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if ([chosen_folder]) in test_folders:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/libritts.py
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed


def libritts_statistics(data_dir):
    speakers = []
    distribution2speakers2pharases2utts = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list))
    )

    distribution_infos = glob(data_dir + "/*")

    for distribution_info in distribution_infos:
        distribution = distribution_info.split("/")[-1]
        print(distribution)

        speaker_infos = glob(distribution_info + "/*")

        if len(speaker_infos) == 0:
            continue

        for speaker_info in speaker_infos:
            speaker = speaker_info.split("/")[-1]

            speakers.append(speaker)

            pharase_infos = glob(speaker_info + "/*")

            for pharase_info in pharase_infos:
                pharase = pharase_info.split("/")[-1]

                utts = glob(pharase_info + "/*.wav")

                for utt in utts:
                    uid = utt.split("/")[-1].split(".")[0]
                    distribution2speakers2pharases2utts[distribution][speaker][
                        pharase
                    ].append(uid)

    unique_speakers = list(set(speakers))
    unique_speakers.sort()

    print("Speakers: \n{}".format("\t".join(unique_speakers)))
    return distribution2speakers2pharases2utts, unique_speakers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing samples for libritts...\n")

    save_dir = os.path.join(output_path, "libritts")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    libritts_path = dataset_path

    distribution2speakers2pharases2utts, unique_speakers = libritts_statistics(
        libritts_path
    )

    # We select phrases of the standard speaker as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for distribution, speakers2pharases2utts in tqdm(
        distribution2speakers2pharases2utts.items()
    ):
        for speaker, pharases2utts in tqdm(speakers2pharases2utts.items()):
            pharase_names = list(pharases2utts.keys())

            for chosen_pharase in pharase_names:
                for chosen_uid in pharases2utts[chosen_pharase]:
                    res = {
                        "Dataset": "libritts",
                        "Singer": speaker,
                        "Uid": "{}#{}#{}#{}".format(
                            distribution, speaker, chosen_pharase, chosen_uid
                        ),
                    }
                    res["Path"] = "{}/{}/{}/{}.wav".format(
                        distribution, speaker, chosen_pharase, chosen_uid
                    )
                    res["Path"] = os.path.join(libritts_path, res["Path"])
                    assert os.path.exists(res["Path"])

                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                    res["Duration"] = duration

                    if "train" not in distribution:
                        res["index"] = test_index_count
                        test_total_duration += duration
                        test.append(res)
                        test_index_count += 1
                    else:
                        res["index"] = train_index_count
                        train_total_duration += duration
                        train.append(res)
                        train_index_count += 1

                    utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/lijian.py
ADDED
@@ -0,0 +1,151 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import glob
import os
import json
import torchaudio
from tqdm import tqdm
from collections import defaultdict

from utils.io import save_audio
from utils.util import has_existed, remove_and_create
from utils.audio_slicer import Slicer
from preprocessors import GOLDEN_TEST_SAMPLES


def split_to_utterances(input_dir, output_dir):
    print("Splitting to utterances for {}...".format(input_dir))

    files_list = glob.glob("*.flac", root_dir=input_dir)
    files_list.sort()
    for wav_file in tqdm(files_list):
        # Load waveform
        waveform, fs = torchaudio.load(os.path.join(input_dir, wav_file))

        # Song name
        filename = wav_file.replace(" ", "")
        filename = filename.replace("(Live)", "")
        song_id, filename = filename.split("李健-")

        song_id = song_id.split("_")[0]
        song_name = "{:03d}".format(int(song_id)) + filename.split("_")[0].split("-")[0]

        # Split
        slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000)
        chunks = slicer.slice(waveform)

        save_dir = os.path.join(output_dir, song_name)
        remove_and_create(save_dir)

        for i, chunk in enumerate(chunks):
            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")
    split_to_utterances(os.path.join(dataset_path, "vocal_v2"), utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["lijian"]
    golden_songs = [s.split("_")[0] for s in golden_samples]
    return golden_songs


def statistics(utt_dir):
    song2utts = defaultdict(list)

    song_infos = glob.glob(utt_dir + "/*")
    song_infos.sort()
    for song in song_infos:
        song_name = song.split("/")[-1]
        utt_infos = glob.glob(song + "/*.wav")
        utt_infos.sort()
        for utt in utt_infos:
            uid = utt.split("/")[-1].split(".")[0]
            song2utts[song_name].append(uid)

    utt_sum = sum([len(utts) for utts in song2utts.values()])
    print("Li Jian: {} unique songs, {} utterances".format(len(song2utts), utt_sum))
    return song2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for Li Jian...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "lijian")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    lijian_path = os.path.join(dataset_path, "utterances")
    song2utts = statistics(lijian_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for chosen_song, utts in tqdm(song2utts.items()):
        for chosen_uid in song2utts[chosen_song]:
            res = {
                "Dataset": "lijian",
                "Singer": "lijian",
                "Uid": "{}_{}".format(chosen_song, chosen_uid),
            }
            res["Path"] = "{}/{}.wav".format(chosen_song, chosen_uid)
            res["Path"] = os.path.join(lijian_path, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if duration <= 1e-8:
                continue

            if chosen_song in test_songs:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/ljspeech.py
ADDED
@@ -0,0 +1,197 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from tqdm import tqdm
import os
import torchaudio
from utils import audio
import csv
import random

from utils.util import has_existed
from text import _clean_text
import librosa
import soundfile as sf
from scipy.io import wavfile

from pathlib import Path
import numpy as np


def textgird_extract(
    corpus_directory,
    output_directory,
    mfa_path=os.path.join("mfa", "montreal-forced-aligner", "bin", "mfa_align"),
    lexicon=os.path.join("mfa", "lexicon", "librispeech-lexicon.txt"),
    acoustic_model_path=os.path.join(
        "mfa", "montreal-forced-aligner", "pretrained_models", "english.zip"
    ),
    jobs="8",
):
    assert os.path.exists(
        corpus_directory
    ), "Please check that the directory contains *.wav, *.lab"
    assert (
        os.path.exists(mfa_path)
        and os.path.exists(lexicon)
        and os.path.exists(acoustic_model_path)
    ), f"Please download the MFA tools to {mfa_path} firstly"
    Path(output_directory).mkdir(parents=True, exist_ok=True)
    print(f"MFA results are saved in {output_directory}")
    os.system(
        f".{os.path.sep}{mfa_path} {corpus_directory} {lexicon} {acoustic_model_path} {output_directory} -j {jobs} --clean"
    )


def get_lines(file):
    lines = []
    with open(file, encoding="utf-8") as f:
        for line in tqdm(f):
            lines.append(line.strip())
    return lines


def get_uid2utt(ljspeech_path, dataset, cfg):
    index_count = 0
    total_duration = 0

    uid2utt = []
    for l in tqdm(dataset):
        items = l.split("|")
        uid = items[0]
        text = items[2]

        res = {
            "Dataset": "LJSpeech",
            "index": index_count,
            "Singer": "LJSpeech",
            "Uid": uid,
            "Text": text,
        }

        # Duration in wav files
        audio_file = os.path.join(ljspeech_path, "wavs/{}.wav".format(uid))

        res["Path"] = audio_file

        waveform, sample_rate = torchaudio.load(audio_file)
        duration = waveform.size(-1) / sample_rate
        res["Duration"] = duration

        uid2utt.append(res)

        index_count = index_count + 1
        total_duration += duration

    return uid2utt, total_duration / 3600


def split_dataset(lines, test_rate=0.05, test_size=None):
    if test_size is None:
        test_size = int(len(lines) * test_rate)
    random.shuffle(lines)

    train_set = []
    test_set = []

    for line in lines[:test_size]:
        test_set.append(line)
    for line in lines[test_size:]:
        train_set.append(line)
    return train_set, test_set


max_wav_value = 32768.0


def prepare_align(dataset, dataset_path, cfg, output_path):
    in_dir = dataset_path
    out_dir = os.path.join(output_path, dataset, cfg.raw_data)
    sampling_rate = cfg.sample_rate
    cleaners = cfg.text_cleaners
    speaker = "LJSpeech"
    with open(os.path.join(dataset_path, "metadata.csv"), encoding="utf-8") as f:
        for line in tqdm(f):
            parts = line.strip().split("|")
            base_name = parts[0]
            text = parts[2]
            text = _clean_text(text, cleaners)

            output_wav_path = os.path.join(out_dir, speaker, "{}.wav".format(base_name))
            output_lab_path = os.path.join(out_dir, speaker, "{}.lab".format(base_name))

            if os.path.exists(output_wav_path) and os.path.exists(output_lab_path):
                continue

            wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
            if os.path.exists(wav_path):
                os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
                wav, _ = librosa.load(wav_path, sr=sampling_rate)
                wav = wav / max(abs(wav)) * max_wav_value

                wavfile.write(
                    os.path.join(out_dir, speaker, "{}.wav".format(base_name)),
                    sampling_rate,
                    wav.astype(np.int16),
                )

                with open(
                    os.path.join(out_dir, speaker, "{}.lab".format(base_name)),
                    "w",
                ) as f1:
                    f1.write(text)
    # Extract textgird with MFA
    textgird_extract(
        corpus_directory=out_dir,
        output_directory=os.path.join(output_path, dataset, "TextGrid"),
    )


def main(output_path, dataset_path, cfg):
    print("-" * 10)
    print("Dataset splits for {}...\n".format("LJSpeech"))

    dataset = "LJSpeech"

    save_dir = os.path.join(output_path, dataset)
    os.makedirs(save_dir, exist_ok=True)
    ljspeech_path = dataset_path

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")

    speaker = "LJSpeech"
    speakers = [dataset + "_" + speaker]
    singer_lut = {name: i for i, name in enumerate(sorted(speakers))}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    if has_existed(train_output_file) and has_existed(test_output_file):
        return

    meta_file = os.path.join(ljspeech_path, "metadata.csv")
    lines = get_lines(meta_file)

    train_set, test_set = split_dataset(lines)

    res, hours = get_uid2utt(ljspeech_path, train_set, cfg)

    # Save train
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(res, f, indent=4, ensure_ascii=False)

    print("Train_hours= {}".format(hours))

    res, hours = get_uid2utt(ljspeech_path, test_set, cfg)

    # Save test
    os.makedirs(save_dir, exist_ok=True)
    with open(test_output_file, "w") as f:
        json.dump(res, f, indent=4, ensure_ascii=False)

    print("Test_hours= {}".format(hours))
preprocessors/ljspeech_vocoder.py
ADDED
@@ -0,0 +1,86 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob

from utils.util import has_existed


def main(output_path, dataset_path):
    print("-" * 10)
    print("Dataset splits for ljspeech...\n")

    save_dir = os.path.join(output_path, "ljspeech")
    ljspeech_path = dataset_path

    wave_files = glob(ljspeech_path + "/wavs/*.wav")

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")

    if has_existed(train_output_file):
        return

    utts = []

    for wave_file in tqdm(wave_files):
        res = {
            "Dataset": "ljspeech",
            "Singer": "female1",
            "Uid": "{}".format(wave_file.split("/")[-1].split(".")[0]),
        }
        res["Path"] = wave_file
        assert os.path.exists(res["Path"])

        waveform, sample_rate = torchaudio.load(res["Path"])
        duration = waveform.size(-1) / sample_rate
        res["Duration"] = duration

        if duration <= 1e-8:
            continue

        utts.append(res)

    test_length = len(utts) // 20

    train_utts = []
    train_index_count = 0
    train_total_duration = 0

    for i in tqdm(range(len(utts) - test_length)):
        tmp = utts[i]
        tmp["index"] = train_index_count
        train_index_count += 1
        train_total_duration += tmp["Duration"]
        train_utts.append(tmp)

    test_utts = []
    test_index_count = 0
    test_total_duration = 0

    for i in tqdm(range(len(utts) - test_length, len(utts))):
        tmp = utts[i]
        tmp["index"] = test_index_count
        test_index_count += 1
        test_total_duration += tmp["Duration"]
        test_utts.append(tmp)

    print("#Train = {}, #Test = {}".format(len(train_utts), len(test_utts)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train_utts, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test_utts, f, indent=4, ensure_ascii=False)
preprocessors/m4singer.py
ADDED
@@ -0,0 +1,138 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import librosa
from tqdm import tqdm
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["m4singer"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("_")[:2] for s in golden_samples]
    # singer_song, eg: Alto-1_美错
    golden_songs = ["_".join(t) for t in golden_songs]
    return golden_songs


def m4singer_statistics(meta):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))
    for utt in meta:
        p, s, uid = utt["item_name"].split("#")
        singers.append(p)
        songs.append(s)
        singer2songs[p][s].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "M4Singer: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for m4singer...\n")

    save_dir = os.path.join(output_path, "m4singer")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    m4singer_dir = dataset_path
    meta_file = os.path.join(m4singer_dir, "meta.json")
    with open(meta_file, "r", encoding="utf-8") as f:
        meta = json.load(f)

    singer2songs, unique_singers = m4singer_statistics(meta)

    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singer2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            chosen_song = chosen_song.replace(" ", "-")
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "m4singer",
                    "Singer": singer,
                    "Song": chosen_song,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }

                res["Path"] = os.path.join(
                    m4singer_dir, "{}#{}/{}.wav".format(singer, chosen_song, chosen_uid)
                )
                assert os.path.exists(res["Path"])

                duration = librosa.get_duration(filename=res["Path"])
                res["Duration"] = duration

                if "_".join([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/metadata.py
ADDED
@@ -0,0 +1,138 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
from tqdm import tqdm


def cal_metadata(cfg):
    """
    Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.
    """
    from collections import Counter

    datasets = cfg.dataset

    print("-" * 10)
    print("Preparing metadata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    datasets.sort()

    for dataset in tqdm(datasets):
        save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
        assert os.path.exists(save_dir)

        # 'train.json' and 'test.json' of target dataset
        train_metadata = os.path.join(save_dir, "train.json")
        test_metadata = os.path.join(save_dir, "test.json")

        # Sort the metadata as the duration order
        with open(train_metadata, "r", encoding="utf-8") as f:
            train_utterances = json.load(f)
        with open(test_metadata, "r", encoding="utf-8") as f:
            test_utterances = json.load(f)

        train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
        test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])

        # Write back the sorted metadata
        with open(train_metadata, "w") as f:
            json.dump(train_utterances, f, indent=4, ensure_ascii=False)
        with open(test_metadata, "w") as f:
            json.dump(test_utterances, f, indent=4, ensure_ascii=False)

        # Paths of metadata needed to be generated
        singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
        utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)

        # Get the total duration and singer names for train and test utterances
        train_total_duration = sum(utt["Duration"] for utt in train_utterances)
        test_total_duration = sum(utt["Duration"] for utt in test_utterances)

        singer_names = set(
            f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            for utt in train_utterances + test_utterances
        )

        # Write the utt2singer file and sort the singer names
        with open(utt2singer_file, "w", encoding="utf-8") as f:
            for utt in train_utterances + test_utterances:
                f.write(
                    f"{utt['Dataset']}_{utt['Uid']}\t{replace_augment_name(utt['Dataset'])}_{utt['Singer']}\n"
                )

        singer_names = sorted(singer_names)
        singer_lut = {name: i for i, name in enumerate(singer_names)}

        # dump singers.json
        with open(singer_dict_file, "w", encoding="utf-8") as f:
            json.dump(singer_lut, f, indent=4, ensure_ascii=False)

        meta_info = {
            "dataset": dataset,
            "statistics": {
                "size": len(train_utterances) + len(test_utterances),
                "hours": round(train_total_duration / 3600, 4)
                + round(test_total_duration / 3600, 4),
            },
            "train": {
                "size": len(train_utterances),
                "hours": round(train_total_duration / 3600, 4),
            },
            "test": {
                "size": len(test_utterances),
                "hours": round(test_total_duration / 3600, 4),
            },
            "singers": {"size": len(singer_lut)},
        }
        # Use Counter to count the minutes for each singer
        total_singer2mins = Counter()
        training_singer2mins = Counter()
        for utt in train_utterances:
            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            training_singer2mins[k] += utt["Duration"] / 60
            total_singer2mins[k] += utt["Duration"] / 60
        for utt in test_utterances:
            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            total_singer2mins[k] += utt["Duration"] / 60

        training_singer2mins = dict(
            sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True)
        )
        training_singer2mins = {k: round(v, 2) for k, v in training_singer2mins.items()}
        meta_info["singers"]["training_minutes"] = training_singer2mins

        total_singer2mins = dict(
            sorted(total_singer2mins.items(), key=lambda x: x[1], reverse=True)
        )
        total_singer2mins = {k: round(v, 2) for k, v in total_singer2mins.items()}
        meta_info["singers"]["minutes"] = total_singer2mins

        with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
            json.dump(meta_info, f, indent=4, ensure_ascii=False)

        for singer, mins in training_singer2mins.items():
            print(f"Singer {singer}: {mins} mins for training")
        print("-" * 10, "\n")


def replace_augment_name(dataset: str) -> str:
    """Replace the augmented dataset name with the original dataset name.
    >>> print(replace_augment_name("dataset_equalizer"))
    dataset
    """
    if "equalizer" in dataset:
        dataset = dataset.replace("_equalizer", "")
    elif "formant_shift" in dataset:
        dataset = dataset.replace("_formant_shift", "")
    elif "pitch_shift" in dataset:
        dataset = dataset.replace("_pitch_shift", "")
    elif "time_stretch" in dataset:
        dataset = dataset.replace("_time_stretch", "")
    else:
        pass
    return dataset
preprocessors/nus48e.py
ADDED
@@ -0,0 +1,203 @@
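A brief note on what the nus48e preprocessor below assumes about on-disk layout: raw songs live under <dataset_path>/<singer>/<read|sing>/<song>.wav, the slicer writes utterances to <dataset_path>/utterances/<singer>/<style>/<song>/<index>.wav, and test membership is decided by (singer, song) pairs parsed from golden sample ids. The sample id in this sketch is an illustrative assumption, not taken from the real GOLDEN_TEST_SAMPLES table.

# Illustrative only: how a golden sample id maps to a (singer, song) test key.
example_golden = "ADIZ_sing#09#0003"       # assumed format: <singer>_<style>#<song>#<uid>
singer_style, song = example_golden.split("#")[:2]
print([singer_style, song])                # compared against [singer, chosen_song] below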
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict


from utils.io import save_audio
from utils.util import has_existed
from utils.audio_slicer import Slicer
from preprocessors import GOLDEN_TEST_SAMPLES


def split_to_utterances(dataset_path, singer, style, output_dir):
    data_dir = os.path.join(dataset_path, singer, style)

    print("Splitting to utterances for {}...".format(data_dir))

    wave_files = glob(data_dir + "/*.wav")

    for wav_file in tqdm(wave_files):
        # Load waveform
        song_name = wav_file.split("/")[-1].split(".")[0]
        waveform, fs = torchaudio.load(wav_file)

        # Split
        slicer = Slicer(sr=fs, threshold=-40.0, max_sil_kept=4000)
        chunks = slicer.slice(waveform)

        for i, chunk in enumerate(chunks):
            save_dir = os.path.join(output_dir, singer, style, song_name)
            os.makedirs(save_dir, exist_ok=True)

            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")

    singer_infos = glob(dataset_path + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        for style in ["read", "sing"]:
            split_to_utterances(dataset_path, singer, style, utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["nus48e"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("#")[:2] for s in golden_samples]
    # singer_song, eg: Female1#Almost_lover_Amateur
    return golden_songs


def nus48e_statistics(data_dir):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(data_dir + "/*")

    for singer_info in singer_infos:
        singer_info_split = singer_info.split("/")[-1]

        style_infos = glob(singer_info + "/*")

        for style_info in style_infos:
            style_info_split = style_info.split("/")[-1]

            singer = singer_info_split + "_" + style_info_split
            singers.append(singer)

            song_infos = glob(style_info + "/*")

            for song_info in song_infos:
                song = song_info.split("/")[-1]

                songs.append(song)

                utts = glob(song_info + "/*.wav")

                for utt in utts:
                    uid = utt.split("/")[-1].split(".")[0]
                    singer2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "nus_48_e: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for nus48e...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "nus48e")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    nus48e_path = os.path.join(dataset_path, "utterances")

    singer2songs, unique_singers = nus48e_statistics(nus48e_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in singer2songs.items():
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "nus48e",
                    "Singer": singer,
                    "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}/{}.wav".format(
                    singer.split("_")[0], singer.split("_")[-1], chosen_song, chosen_uid
                )
                res["Path"] = os.path.join(nus48e_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if duration <= 1e-8:
                    continue

                if ([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/opencpop.py
ADDED
@@ -0,0 +1,73 @@
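For orientation, the opencpop preprocessor below only needs the utterance id from each transcription line: it splits on "|" and keeps the first field, then resolves the audio under segments/wavs/. The line content in this sketch is a placeholder, not a real Opencpop annotation.

# Illustrative only: parsing one line of segments/train.txt (fields after the uid are ignored here).
line = "2001000001|<text>|<phonemes>|<notes>|..."
uid = line.split("|")[0]
wav_path = "segments/wavs/{}.wav".format(uid)
print(uid, wav_path)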
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from tqdm import tqdm
import os
import librosa

from utils.util import has_existed


def get_lines(file):
    with open(file, "r") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    return lines


def get_uid2utt(opencpop_path, dataset, dataset_type):
    index_count = 0
    total_duration = 0

    file = os.path.join(opencpop_path, "segments", "{}.txt".format(dataset_type))
    lines = get_lines(file)

    uid2utt = []
    for l in tqdm(lines):
        items = l.split("|")
        uid = items[0]

        res = {
            "Dataset": dataset,
            "index": index_count,
            "Singer": "female1",
            "Uid": uid,
        }

        # Duration in wav files
        audio_file = os.path.join(opencpop_path, "segments/wavs/{}.wav".format(uid))
        res["Path"] = audio_file

        duration = librosa.get_duration(filename=res["Path"])
        res["Duration"] = duration

        uid2utt.append(res)

        index_count = index_count + 1
        total_duration += duration

    return uid2utt, total_duration / 3600


def main(dataset, output_path, dataset_path):
    print("-" * 10)
    print("Dataset splits for {}...\n".format(dataset))

    save_dir = os.path.join(output_path, dataset)
    opencpop_path = dataset_path
    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        res, hours = get_uid2utt(opencpop_path, dataset, dataset_type)

        # Save
        os.makedirs(save_dir, exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)

        print("{}_{}_hours= {}".format(dataset, dataset_type, hours))
preprocessors/opensinger.py
ADDED
@@ -0,0 +1,169 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import os
import json
import librosa
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["opensinger"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("_")[:3] for s in golden_samples]
    # singer_song, eg: Female1#Almost_lover_Amateur
    return golden_songs


def opensinger_statistics(data_dir):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))

    gender_infos = glob(data_dir + "/*")

    for gender_info in gender_infos:
        gender_info_split = gender_info.split("/")[-1][:-3]

        singer_and_song_infos = glob(gender_info + "/*")

        for singer_and_song_info in singer_and_song_infos:
            singer_and_song_info_split = singer_and_song_info.split("/")[-1].split("_")
            singer_id, song = (
                singer_and_song_info_split[0],
                singer_and_song_info_split[1],
            )
            singer = gender_info_split + "_" + singer_id
            singers.append(singer)
            songs.append(song)

            utts = glob(singer_and_song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split("_")[-1].split(".")[0]
                singer2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "opensinger: {} singers, {} songs ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for opensinger...\n")

    save_dir = os.path.join(output_path, "opensinger")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    opensinger_path = dataset_path

    singer2songs, unique_singers = opensinger_statistics(opensinger_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for i, (singer, songs) in enumerate(singer2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in tqdm(
            song_names, desc="Singer {}/{}".format(i, len(singer2songs))
        ):
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "opensinger",
                    "Singer": singer,
                    "Song": chosen_song,
                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}Raw/{}_{}/{}_{}_{}.wav".format(
                    singer.split("_")[0],
                    singer.split("_")[1],
                    chosen_song,
                    singer.split("_")[1],
                    chosen_song,
                    chosen_uid,
                )
                res["Path"] = os.path.join(opensinger_path, res["Path"])
                assert os.path.exists(res["Path"])

                duration = librosa.get_duration(filename=res["Path"])
                res["Duration"] = duration

                if duration > 30:
                    print(
                        "Wav file: {}, the duration = {:.2f}s > 30s, which has been abandoned.".format(
                            res["Path"], duration
                        )
                    )
                    continue

                if (
                    [singer.split("_")[0], singer.split("_")[1], chosen_song]
                ) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/opera.py
ADDED
@@ -0,0 +1,186 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
from tqdm import tqdm
import torchaudio
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from utils.io import save_audio
from utils.audio_slicer import Slicer
from preprocessors import GOLDEN_TEST_SAMPLES


def split_to_utterances(language_dir, output_dir):
    print("Splitting to utterances for {}...".format(language_dir))

    for wav_file in tqdm(glob("{}/*/*".format(language_dir))):
        # Load waveform
        singer_name, song_name = wav_file.split("/")[-2:]
        song_name = song_name.split(".")[0]
        waveform, fs = torchaudio.load(wav_file)

        # Split
        slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000)
        chunks = slicer.slice(waveform)

        for i, chunk in enumerate(chunks):
            save_dir = os.path.join(output_dir, singer_name, song_name)
            os.makedirs(save_dir, exist_ok=True)

            output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
            save_audio(output_file, chunk, fs)


def _main(dataset_path):
    """
    Split to utterances
    """
    utterance_dir = os.path.join(dataset_path, "utterances")

    for lang in ["chinese", "western"]:
        split_to_utterances(os.path.join(dataset_path, lang), utterance_dir)


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["opera"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("#")[:2] for s in golden_samples]
    # singer#song, eg: fem_01#neg_01
    return golden_songs


def opera_statistics(data_dir):
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    singer_infos = glob(data_dir + "/*")

    for singer_info in singer_infos:
        singer = singer_info.split("/")[-1]

        song_infos = glob(singer_info + "/*")

        for song_info in song_infos:
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            utts = glob(song_info + "/*.wav")

            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "opera: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for opera...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Splitting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "opera")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    opera_path = os.path.join(dataset_path, "utterances")

    singers2songs, unique_singers = opera_statistics(opera_path)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singers2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "opera",
                    "Singer": singer,
                    "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
                res["Path"] = os.path.join(opera_path, res["Path"])
                assert os.path.exists(res["Path"])

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if duration <= 1e-8:
                    continue

                if ([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/pjs.py
ADDED
@@ -0,0 +1,135 @@
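A small worked example of the splitting scheme used below (three chunks per trimmed song with a fixed overlap); the sample rate and duration are assumed numbers for illustration only.

# Assumed: a 30 s trimmed song at 16 kHz, n_utterance_splits=3, overlapping=1 s.
sample_rate = 16000
audio_len = 30 * sample_rate
lapping_len = 1 * sample_rate
for i in range(3):
    start = i * audio_len // 3
    end = start + audio_len // 3 + lapping_len
    print(i, start / sample_rate, min(end, audio_len) / sample_rate)
# -> chunks covering roughly 0-11 s, 10-21 s, and 20-30 s.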
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
from tqdm import tqdm
import glob
import json
import torchaudio

from utils.util import has_existed
from utils.io import save_audio


def get_splitted_utterances(
    raw_wav_dir, trimed_wav_dir, n_utterance_splits, overlapping
):
    res = []
    raw_song_files = glob.glob(
        os.path.join(raw_wav_dir, "**/pjs*_song.wav"), recursive=True
    )
    trimed_song_files = glob.glob(
        os.path.join(trimed_wav_dir, "**/*.wav"), recursive=True
    )

    if len(raw_song_files) * n_utterance_splits == len(trimed_song_files):
        print("Splitting already done...")
        for wav_file in tqdm(trimed_song_files):
            uid = wav_file.split("/")[-1].split(".")[0]
            utt = {"Dataset": "pjs", "Singer": "male1", "Uid": uid, "Path": wav_file}

            waveform, sample_rate = torchaudio.load(wav_file)
            duration = waveform.size(-1) / sample_rate
            utt["Duration"] = duration

            res.append(utt)

    else:
        for wav_file in tqdm(raw_song_files):
            song_id = wav_file.split("/")[-1].split(".")[0]

            waveform, sample_rate = torchaudio.load(wav_file)
            trimed_waveform = torchaudio.functional.vad(waveform, sample_rate)
            trimed_waveform = torchaudio.functional.vad(
                trimed_waveform.flip(dims=[1]), sample_rate
            ).flip(dims=[1])

            audio_len = trimed_waveform.size(-1)
            lapping_len = overlapping * sample_rate

            for i in range(n_utterance_splits):
                start = i * audio_len // 3
                end = start + audio_len // 3 + lapping_len
                splitted_waveform = trimed_waveform[:, start:end]

                utt = {
                    "Dataset": "pjs",
                    "Singer": "male1",
                    "Uid": "{}_{}".format(song_id, i),
                }

                # Duration
                duration = splitted_waveform.size(-1) / sample_rate
                utt["Duration"] = duration

                # Save trimed wav
                splitted_waveform_file = os.path.join(
                    trimed_wav_dir, "{}.wav".format(utt["Uid"])
                )
                save_audio(splitted_waveform_file, splitted_waveform, sample_rate)

                # Path
                utt["Path"] = splitted_waveform_file

                res.append(utt)

    res = sorted(res, key=lambda x: x["Uid"])
    return res


def main(output_path, dataset_path, n_utterance_splits=3, overlapping=1):
    """
    1. Split one raw utterance into three splits (since some samples are too long)
    2. Overlapping of adjacent splits is 1 s
    """
    print("-" * 10)
    print("Preparing training dataset for PJS...")

    save_dir = os.path.join(output_path, "pjs")
    raw_wav_dir = os.path.join(dataset_path, "PJS_corpus_ver1.1")

    # Trim for silence
    trimed_wav_dir = os.path.join(dataset_path, "trim")
    os.makedirs(trimed_wav_dir, exist_ok=True)

    # Total utterances
    utterances = get_splitted_utterances(
        raw_wav_dir, trimed_wav_dir, n_utterance_splits, overlapping
    )
    total_uids = [utt["Uid"] for utt in utterances]

    # Test uids
    n_test_songs = 3
    test_uids = []
    for i in range(1, n_test_songs + 1):
        test_uids += [
            "pjs00{}_song_{}".format(i, split_id)
            for split_id in range(n_utterance_splits)
        ]

    # Train uids
    train_uids = [uid for uid in total_uids if uid not in test_uids]

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        uids = eval("{}_uids".format(dataset_type))
        res = [utt for utt in utterances if utt["Uid"] in uids]
        for i in range(len(res)):
            res[i]["index"] = i

        time = sum([utt["Duration"] for utt in res])
        print(
            "{}, Total size: {}, Total Durations = {} s = {:.2f} hour\n".format(
                dataset_type, len(res), time, time / 3600
            )
        )

        # Save
        os.makedirs(save_dir, exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)
preprocessors/popbutfy.py
ADDED
@@ -0,0 +1,153 @@
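The popbutfy preprocessor below derives audio paths from the <singer>#singing#<song> folder naming convention; here is that pattern with placeholder names (illustrative assumptions only, not taken from the dataset).

# Illustrative only: placeholder singer/song/uid values.
singer, song, uid = "Female1", "Almost_lover_Amateur", "0"
path = "{}#singing#{}/{}#singing#{}_{}.mp3".format(singer, song, singer, song, uid)
print(path)  # Female1#singing#Almost_lover_Amateur/Female1#singing#Almost_lover_Amateur_0.mp3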
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
import librosa
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["popbutfy"]
    # every item is a tuple (singer, song)
    golden_songs = [s.split("#")[:2] for s in golden_samples]
    # singer#song, eg: Female1#Almost_lover_Amateur
    return golden_songs


def popbutfy_statistics(data_dir):
    singers = []
    songs = []
    singer2songs = defaultdict(lambda: defaultdict(list))

    data_infos = glob(data_dir + "/*")

    for data_info in data_infos:
        data_info_split = data_info.split("/")[-1].split("#")

        singer, song = data_info_split[0], data_info_split[-1]
        singers.append(singer)
        songs.append(song)

        utts = glob(data_info + "/*")

        for utt in utts:
            uid = utt.split("/")[-1].split("_")[-1].split(".")[0]
            singer2songs[singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "PopBuTFy: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for popbutfy...\n")

    save_dir = os.path.join(output_path, "popbutfy")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    popbutfy_dir = dataset_path

    singer2songs, unique_singers = popbutfy_statistics(popbutfy_dir)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for singer, songs in tqdm(singer2songs.items()):
        song_names = list(songs.keys())

        for chosen_song in song_names:
            for chosen_uid in songs[chosen_song]:
                res = {
                    "Dataset": "popbutfy",
                    "Singer": singer,
                    "Song": chosen_song,
                    "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
                }
                res["Path"] = "{}#singing#{}/{}#singing#{}_{}.mp3".format(
                    singer, chosen_song, singer, chosen_song, chosen_uid
                )
                if not os.path.exists(os.path.join(popbutfy_dir, res["Path"])):
                    res["Path"] = "{}#singing#{}/{}#singing#{}_{}.wav".format(
                        singer, chosen_song, singer, chosen_song, chosen_uid
                    )
                res["Path"] = os.path.join(popbutfy_dir, res["Path"])
                assert os.path.exists(res["Path"])

                if res["Path"].split("/")[-1].split(".")[-1] == "wav":
                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                else:
                    waveform, sample_rate = librosa.load(res["Path"])
                    duration = waveform.shape[-1] / sample_rate
                res["Duration"] = duration

                if ([singer, chosen_song]) in test_songs:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                else:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1

                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/popcs.py
ADDED
@@ -0,0 +1,118 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from glob import glob
from collections import defaultdict

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def get_test_songs():
    golden_samples = GOLDEN_TEST_SAMPLES["popcs"]
    # every item is a string
    golden_songs = [s.split("_")[:1] for s in golden_samples]
    # song, eg: 万有引力
    return golden_songs


def popcs_statistics(data_dir):
    songs = []
    songs2utts = defaultdict(list)

    song_infos = glob(data_dir + "/*")

    for song_info in song_infos:
        song_info_split = song_info.split("/")[-1].split("-")[-1]

        songs.append(song_info_split)

        utts = glob(song_info + "/*.wav")

        for utt in utts:
            uid = utt.split("/")[-1].split("_")[0]
            songs2utts[song_info_split].append(uid)

    unique_songs = list(set(songs))
    unique_songs.sort()

    print(
        "popcs: {} utterances ({} unique songs)".format(len(songs), len(unique_songs))
    )
    print("Songs: \n{}".format("\t".join(unique_songs)))
    return songs2utts


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for popcs...\n")

    save_dir = os.path.join(output_path, "popcs")
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    if has_existed(test_output_file):
        return

    # Load
    popcs_dir = dataset_path

    songs2utts = popcs_statistics(popcs_dir)
    test_songs = get_test_songs()

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    song_names = list(songs2utts.keys())

    for chosen_song in song_names:
        for chosen_uid in songs2utts[chosen_song]:
            res = {
                "Dataset": "popcs",
                "Singer": "female1",
                "Song": chosen_song,
                "Uid": "{}_{}".format(chosen_song, chosen_uid),
            }
            res["Path"] = "popcs-{}/{}_wf0.wav".format(chosen_song, chosen_uid)
            res["Path"] = os.path.join(popcs_dir, res["Path"])
            assert os.path.exists(res["Path"])

            waveform, sample_rate = torchaudio.load(res["Path"])
            duration = waveform.size(-1) / sample_rate
            res["Duration"] = duration

            if ([chosen_song]) in test_songs:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save
    os.makedirs(save_dir, exist_ok=True)
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/processor.py
ADDED
@@ -0,0 +1,100 @@
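A hedged usage sketch for the dispatcher below; the paths are placeholders and cfg stands in for whatever configuration object the framework has already loaded elsewhere.

# Illustrative call only; commented out because it needs the dataset on disk and a loaded cfg.
# preprocess_dataset(
#     dataset="opencpop",
#     dataset_path="/path/to/Opencpop",
#     output_path="data/processed",
#     cfg=cfg,
# )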
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
from preprocessors import (
    m4singer,
    opencpop,
    svcc,
    pjs,
    popbutfy,
    opensinger,
    popcs,
    kising,
    csd,
    opera,
    nus48e,
    svcceval,
    vctk,
    vctksample,
    libritts,
    lijian,
    cdmusiceval,
    ljspeech,
    coco,
    cocoeval,
    custom,
    vocalist,
    ljspeech_vocoder,
)


def preprocess_dataset(
    dataset, dataset_path, output_path, cfg, is_custom_dataset=False
):
    """Call specific function to handle specific dataset
    Args:
        dataset (str): name of a dataset, e.g. opencpop, m4singer
        dataset_path (str): path to dataset
        output_path (str): path to store preprocessing result files
    """
    if is_custom_dataset:
        custom.main(output_path, dataset_path, dataset_name=dataset)
        return

    if re.match("opencpop*", dataset):
        opencpop.main(dataset, output_path, dataset_path)
    if dataset == "m4singer":
        m4singer.main(output_path, dataset_path)
    if dataset == "svcc":
        svcc.main(output_path, dataset_path)
    if dataset == "pjs":
        pjs.main(output_path, dataset_path)
    if dataset == "popbutfy":
        popbutfy.main(output_path, dataset_path)
    if dataset == "opensinger":
        opensinger.main(output_path, dataset_path)
    if dataset == "popcs":
        popcs.main(output_path, dataset_path)
    if dataset == "kising":
        kising.main(output_path, dataset_path)
    if dataset == "csd":
        csd.main(output_path, dataset_path)
    if dataset == "opera":
        opera.main(output_path, dataset_path)
    if dataset == "nus48e":
        nus48e.main(output_path, dataset_path)
    if dataset == "vctk":
        vctk.main(output_path, dataset_path)
    if dataset == "svcceval":
        svcceval.main(output_path, dataset_path)
    if dataset == "libritts":
        libritts.main(output_path, dataset_path)
    if dataset == "lijian":
        lijian.main(output_path, dataset_path)
    if dataset == "cdmusiceval":
        cdmusiceval.main(output_path, dataset_path)
    if dataset == "LJSpeech":
        ljspeech.main(output_path, dataset_path, cfg)
    if dataset == "ljspeech":
        ljspeech_vocoder.main(output_path, dataset_path)
    if dataset == "coco":
        coco.main(output_path, dataset_path)
    if dataset == "cocoeval":
        cocoeval.main(output_path, dataset_path)
    if dataset == "vocalist":
        vocalist.main(output_path, dataset_path)


def prepare_align(dataset, dataset_path, cfg, output_path):
    """Call specific function to handle specific dataset

    Args:
        dataset (str): name of a dataset, e.g. ljspeech
        dataset_path (str): path to dataset
        output_path (str): path to store preprocessing result files
    """
    if dataset == "LJSpeech":
        ljspeech.prepare_align(dataset, dataset_path, cfg, output_path)
preprocessors/svcc.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import glob
import librosa
import json

from utils.util import has_existed
from preprocessors import GOLDEN_TEST_SAMPLES


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing training dataset for svcc...")

    data_dir = os.path.join(dataset_path, "Data")
    save_dir = os.path.join(output_path, "svcc")
    os.makedirs(save_dir, exist_ok=True)

    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    utt2singer = open(utt2singer_file, "w")

    # Load utterances
    train = []
    test = []
    singers = []

    for wav_file in glob.glob(os.path.join(data_dir, "*/*.wav")):
        singer, filename = wav_file.split("/")[-2:]
        uid = filename.split(".")[0]
        utt = {
            "Dataset": "svcc",
            "Singer": singer,
            "Uid": "{}_{}".format(singer, uid),
            "Path": wav_file,
        }

        # Duration
        duration = librosa.get_duration(filename=wav_file)
        utt["Duration"] = duration

        if utt["Uid"] in GOLDEN_TEST_SAMPLES["svcc"]:
            test.append(utt)
        else:
            train.append(utt)

        singers.append(singer)
        utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))

    # Save singers.json
    unique_singers = list(set(singers))
    unique_singers.sort()
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    train_total_duration = sum([utt["Duration"] for utt in train])
    test_total_duration = sum([utt["Duration"] for utt in test])

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        utterances = eval(dataset_type)
        utterances = sorted(utterances, key=lambda x: x["Uid"])

        for i in range(len(utterances)):
            utterances[i]["index"] = i

        print("{}: Total size: {}\n".format(dataset_type, len(utterances)))

        # Save
        with open(output_file, "w") as f:
            json.dump(utterances, f, indent=4, ensure_ascii=False)

    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )
preprocessors/svcceval.py
ADDED
@@ -0,0 +1,80 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import glob
import librosa
import json

from utils.util import has_existed


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing training dataset for svcceval...")

    data_dir = os.path.join(dataset_path, "Data")
    save_dir = os.path.join(output_path, "svcceval")
    os.makedirs(save_dir, exist_ok=True)

    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    utt2singer = open(utt2singer_file, "w")

    # Load utterances
    train = []
    test = []
    singers = []
    for wav_file in glob.glob(os.path.join(data_dir, "*/*.wav")):
        singer, filename = wav_file.split("/")[-2:]
        uid = filename.split(".")[0]
        utt = {
            "Dataset": "svcceval",
            "Singer": singer,
            "Uid": "{}_{}".format(singer, uid),
            "Path": wav_file,
        }

        # Duration
        duration = librosa.get_duration(filename=wav_file)
        utt["Duration"] = duration

        test.append(utt)

        singers.append(singer)
        utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))

    # Save singers.json
    unique_singers = list(set(singers))
    unique_singers.sort()
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    train_total_duration = sum([utt["Duration"] for utt in train])
    test_total_duration = sum([utt["Duration"] for utt in test])

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        utterances = eval(dataset_type)
        utterances = sorted(utterances, key=lambda x: x["Uid"])

        for i in range(len(utterances)):
            utterances[i]["index"] = i

        print("{}: Total size: {}\n".format(dataset_type, len(utterances)))

        # Save
        with open(output_file, "w") as f:
            json.dump(utterances, f, indent=4, ensure_ascii=False)

    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )
preprocessors/vctk.py
ADDED
@@ -0,0 +1,163 @@
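For orientation, the vctk preprocessor below builds each utterance id from the second and third underscore-separated fields of the flac filename; the filename here follows the public VCTK naming scheme and is used purely as an example.

# Example: wav48_silence_trimmed/p225/p225_001_mic1.flac -> uid "001_mic1"
fname = "p225_001_mic1.flac"
uid = fname.split("_")[1] + "_" + fname.split("_")[2].split(".")[0]
print(uid)  # 001_mic1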
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import librosa
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed


def get_lines(file):
    with open(file, "r") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    return lines


def vctk_statistics(data_dir):
    speakers = []
    speakers2utts = defaultdict(list)

    speaker_infos = glob(data_dir + "/wav48_silence_trimmed" + "/*")

    for speaker_info in speaker_infos:
        speaker = speaker_info.split("/")[-1]

        if speaker == "log.txt":
            continue

        speakers.append(speaker)

        utts = glob(speaker_info + "/*")

        for utt in utts:
            uid = (
                utt.split("/")[-1].split("_")[1]
                + "_"
                + utt.split("/")[-1].split("_")[2].split(".")[0]
            )
            speakers2utts[speaker].append(uid)

    unique_speakers = list(set(speakers))
    unique_speakers.sort()

    print("Speakers: \n{}".format("\t".join(unique_speakers)))
    return speakers2utts, unique_speakers


def vctk_speaker_infos(data_dir):
    file = os.path.join(data_dir, "speaker-info.txt")
    lines = get_lines(file)

    ID2speakers = defaultdict()
    for l in tqdm(lines):
        items = l.replace(" ", "")

        if items[:2] == "ID":
            # The header line
            continue

        if items[0] == "p":
            id = items[:4]
            gender = items[6]
        elif items[0] == "s":
            id = items[:2]
            gender = items[4]

        if gender == "F":
            speaker = "female_{}".format(id)
        elif gender == "M":
            speaker = "male_{}".format(id)

        ID2speakers[id] = speaker

    return ID2speakers


def main(output_path, dataset_path, TEST_NUM_OF_EVERY_SPEAKER=3):
    print("-" * 10)
    print("Preparing test samples for vctk...")

    save_dir = os.path.join(output_path, "vctk")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    vctk_dir = dataset_path

    ID2speakers = vctk_speaker_infos(vctk_dir)
    speaker2utts, unique_speakers = vctk_statistics(vctk_dir)

    # Hold out the first TEST_NUM_OF_EVERY_SPEAKER utterances of every speaker as test utts
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0
    test_speaker_count = defaultdict(int)

    train_total_duration = 0
    test_total_duration = 0

    for i, speaker in enumerate(speaker2utts.keys()):
        for chosen_uid in tqdm(
            speaker2utts[speaker],
            desc="Speaker {}/{}, #Train = {}, #Test = {}".format(
                i + 1, len(speaker2utts), train_index_count, test_index_count
            ),
        ):
            res = {
                "Dataset": "vctk",
                "Singer": ID2speakers[speaker],
                "Uid": "{}#{}".format(ID2speakers[speaker], chosen_uid),
            }
            res["Path"] = "{}/{}_{}.flac".format(speaker, speaker, chosen_uid)
            res["Path"] = os.path.join(vctk_dir, "wav48_silence_trimmed", res["Path"])
            assert os.path.exists(res["Path"])

            duration = librosa.get_duration(filename=res["Path"])
            res["Duration"] = duration

            if test_speaker_count[speaker] < TEST_NUM_OF_EVERY_SPEAKER:
                res["index"] = test_index_count
                test_total_duration += duration
                test.append(res)
                test_index_count += 1
                test_speaker_count[speaker] += 1
            else:
                res["index"] = train_index_count
                train_total_duration += duration
                train.append(res)
                train_index_count += 1

            utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
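For orientation, a minimal sketch of calling this preprocessor directly; both paths are placeholders (not values from the repo), and in practice main() is invoked by the preprocessing pipeline rather than by hand:

# Illustrative only: the paths below are placeholders, not part of the repository.
from preprocessors import vctk

vctk.main(
    output_path="/path/to/processed_data",   # vctk/train.json, test.json, singers.json land here
    dataset_path="/path/to/VCTK-Corpus",     # raw corpus root containing wav48_silence_trimmed/
    TEST_NUM_OF_EVERY_SPEAKER=3,             # utterances per speaker held out for the test split
)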
preprocessors/vctkfewsinger.py
ADDED
@@ -0,0 +1,175 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import pickle
import glob
from collections import defaultdict
from tqdm import tqdm


# Train: male 20 hours, female 10 hours
TRAIN_MALE_MAX_SECONDS = 20 * 3600
TRAIN_FEMALE_MAX_SECONDS = 10 * 3600
TEST_MAX_NUM_EVERY_PERSON = 5


def select_sample_idxs():
    chosen_speakers = get_chosen_speakers()

    with open(os.path.join(vctk_dir, "train.json"), "r") as f:
        raw_train = json.load(f)
    with open(os.path.join(vctk_dir, "test.json"), "r") as f:
        raw_test = json.load(f)

    train_idxs, test_idxs = [], []

    # =========== Test ===========
    test_nums = defaultdict(int)
    for utt in tqdm(raw_train):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
            test_nums[singer] += 1
            test_idxs.append("train_{}".format(idx))

    for utt in tqdm(raw_test):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
            test_nums[singer] += 1
            test_idxs.append("test_{}".format(idx))

    # =========== Train ===========
    for utt in tqdm(raw_train):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and "train_{}".format(idx) not in test_idxs:
            train_idxs.append("train_{}".format(idx))

    for utt in tqdm(raw_test):
        idx = utt["index"]
        singer = utt["Singer"]

        if singer in chosen_speakers and "test_{}".format(idx) not in test_idxs:
            train_idxs.append("test_{}".format(idx))

    train_idxs.sort()
    test_idxs.sort()
    return train_idxs, test_idxs, raw_train, raw_test


def statistics_of_speakers():
    speaker2time = defaultdict(float)
    sex2time = defaultdict(float)

    with open(os.path.join(vctk_dir, "train.json"), "r") as f:
        train = json.load(f)
    with open(os.path.join(vctk_dir, "test.json"), "r") as f:
        test = json.load(f)

    for utt in train + test:
        # accumulate seconds per speaker (printed in minutes below)
        speaker2time[utt["Singer"]] += utt["Duration"]
        # accumulate seconds per sex (printed in hours below)
        sex2time[utt["Singer"].split("_")[0]] += utt["Duration"]

    print(
        "Female: {:.2f} hours, Male: {:.2f} hours.\n".format(
            sex2time["female"] / 3600, sex2time["male"] / 3600
        )
    )

    speaker2time = sorted(speaker2time.items(), key=lambda x: x[-1], reverse=True)
    for singer, seconds in speaker2time:
        print("{}\t{:.2f} mins".format(singer, seconds / 60))

    return speaker2time


def get_chosen_speakers():
    speaker2time = statistics_of_speakers()

    chosen_time = defaultdict(float)
    chosen_speaker = defaultdict(list)
    train_constraint = {
        "male": TRAIN_MALE_MAX_SECONDS,
        "female": TRAIN_FEMALE_MAX_SECONDS,
    }

    for speaker, seconds in speaker2time:
        sex = speaker.split("_")[0]
        if chosen_time[sex] < train_constraint[sex]:
            chosen_time[sex] += seconds
            chosen_speaker[sex].append(speaker)

    speaker2time = dict(speaker2time)
    chosen_speaker = chosen_speaker["male"] + chosen_speaker["female"]
    print("\n#Chosen speakers = {}".format(len(chosen_speaker)))
    for spk in chosen_speaker:
        print("{}\t{:.2f} mins".format(spk, speaker2time[spk] / 60))

    return chosen_speaker


if __name__ == "__main__":
    root_path = ""
    vctk_dir = os.path.join(root_path, "vctk")
    fewspeaker_dir = os.path.join(root_path, "vctkfewspeaker")
    os.makedirs(fewspeaker_dir, exist_ok=True)

    train_idxs, test_idxs, raw_train, raw_test = select_sample_idxs()
    print("#Train = {}, #Test = {}".format(len(train_idxs), len(test_idxs)))

    # There is no data leakage between train and test
    assert len(set(train_idxs).intersection(set(test_idxs))) == 0
    for idx in train_idxs + test_idxs:
        # No chosen sample comes from the raw vctk test split
        assert "test_" not in idx

    for split, chosen_idxs in zip(["train", "test"], [train_idxs, test_idxs]):
        print("{}: #chosen idx = {}\n".format(split, len(chosen_idxs)))

        # Select features
        feat_files = glob.glob("**/train.pkl", root_dir=vctk_dir, recursive=True)
        for file in tqdm(feat_files):
            raw_file = os.path.join(vctk_dir, file)
            new_file = os.path.join(
                fewspeaker_dir, file.replace("train.pkl", "{}.pkl".format(split))
            )

            new_dir = "/".join(new_file.split("/")[:-1])
            os.makedirs(new_dir, exist_ok=True)

            if "mel_min" in file or "mel_max" in file:
                os.system("cp {} {}".format(raw_file, new_file))
                continue

            with open(raw_file, "rb") as f:
                raw_feats = pickle.load(f)

            print("file: {}, #raw_feats = {}".format(file, len(raw_feats)))
            new_feats = []
            for idx in chosen_idxs:
                chosen_split_is_train, raw_idx = idx.split("_")
                assert chosen_split_is_train == "train"
                new_feats.append(raw_feats[int(raw_idx)])

            with open(new_file, "wb") as f:
                pickle.dump(new_feats, f)
            print("New file: {}, #new_feats = {}".format(new_file, len(new_feats)))

        # Utterance re-index
        news_utts = [raw_train[int(idx.split("_")[-1])] for idx in chosen_idxs]
        for i, utt in enumerate(news_utts):
            utt["Dataset"] = "vctkfewsinger"
            utt["index"] = i

        with open(os.path.join(fewspeaker_dir, "{}.json".format(split)), "w") as f:
            json.dump(news_utts, f, indent=4)
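A small illustration of the index encoding used above (the value is hypothetical; the split/parse logic mirrors the script):

# Each chosen index is encoded as "<raw split>_<raw index>" relative to the original
# vctk metadata; only entries from the raw train split survive the final assertions.
idx = "train_42"                      # hypothetical entry of train_idxs / test_idxs
raw_split, raw_idx = idx.split("_")
assert raw_split == "train"
print(int(raw_idx))                   # 42: position inside vctk's train.json / train.pkl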
preprocessors/vctksample.py
ADDED
@@ -0,0 +1,108 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import pickle
import glob
from collections import defaultdict
from tqdm import tqdm
from preprocessors import get_golden_samples_indexes


TRAIN_MAX_NUM_EVERY_PERSON = 250
TEST_MAX_NUM_EVERY_PERSON = 25


def select_sample_idxs():
    # =========== Train ===========
    with open(os.path.join(vctk_dir, "train.json"), "r") as f:
        raw_train = json.load(f)

    train_idxs = []
    train_nums = defaultdict(int)
    for utt in tqdm(raw_train):
        idx = utt["index"]
        singer = utt["Singer"]

        if train_nums[singer] < TRAIN_MAX_NUM_EVERY_PERSON:
            train_idxs.append(idx)
            train_nums[singer] += 1

    # =========== Test ===========
    with open(os.path.join(vctk_dir, "test.json"), "r") as f:
        raw_test = json.load(f)

    # golden test
    test_idxs = get_golden_samples_indexes(
        dataset_name="vctk", split="test", dataset_dir=vctk_dir
    )
    test_nums = defaultdict(int)
    for idx in test_idxs:
        singer = raw_test[idx]["Singer"]
        test_nums[singer] += 1

    for utt in tqdm(raw_test):
        idx = utt["index"]
        singer = utt["Singer"]

        if test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
            test_idxs.append(idx)
            test_nums[singer] += 1

    train_idxs.sort()
    test_idxs.sort()
    return train_idxs, test_idxs, raw_train, raw_test


if __name__ == "__main__":
    root_path = ""
    vctk_dir = os.path.join(root_path, "vctk")
    sample_dir = os.path.join(root_path, "vctksample")
    os.makedirs(sample_dir, exist_ok=True)

    train_idxs, test_idxs, raw_train, raw_test = select_sample_idxs()
    print("#Train = {}, #Test = {}".format(len(train_idxs), len(test_idxs)))

    for split, chosen_idxs, utterances in zip(
        ["train", "test"], [train_idxs, test_idxs], [raw_train, raw_test]
    ):
        print(
            "#{} = {}, #chosen idx = {}\n".format(
                split, len(utterances), len(chosen_idxs)
            )
        )

        # Select features
        feat_files = glob.glob(
            "**/{}.pkl".format(split), root_dir=vctk_dir, recursive=True
        )
        for file in tqdm(feat_files):
            raw_file = os.path.join(vctk_dir, file)
            new_file = os.path.join(sample_dir, file)

            new_dir = "/".join(new_file.split("/")[:-1])
            os.makedirs(new_dir, exist_ok=True)

            if "mel_min" in file or "mel_max" in file:
                os.system("cp {} {}".format(raw_file, new_file))
                continue

            with open(raw_file, "rb") as f:
                raw_feats = pickle.load(f)

            print("file: {}, #raw_feats = {}".format(file, len(raw_feats)))
            new_feats = [raw_feats[idx] for idx in chosen_idxs]
            with open(new_file, "wb") as f:
                pickle.dump(new_feats, f)

        # Utterance re-index
        news_utts = [utterances[idx] for idx in chosen_idxs]
        for i, utt in enumerate(news_utts):
            utt["Dataset"] = "vctksample"
            utt["index"] = i

        with open(os.path.join(sample_dir, "{}.json".format(split)), "w") as f:
            json.dump(news_utts, f, indent=4)
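As a quick sanity check (illustrative only, reusing the names defined above), the per-speaker test counts can be inspected after select_sample_idxs() returns:

# Illustrative: golden samples enter test_idxs first, then each speaker is
# topped up to at most TEST_MAX_NUM_EVERY_PERSON utterances.
from collections import Counter

per_speaker = Counter(raw_test[idx]["Singer"] for idx in test_idxs)
print(per_speaker.most_common(5))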
preprocessors/vocalist.py
ADDED
@@ -0,0 +1,137 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import torchaudio
from tqdm import tqdm
from glob import glob
from collections import defaultdict

from utils.util import has_existed


def vocalist_statistics(data_dir):
    singers = []
    songs = []
    global2singer2songs = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    global_infos = glob(data_dir + "/*")

    for global_info in global_infos:
        global_split = global_info.split("/")[-1]

        singer_infos = glob(global_info + "/*")

        for singer_info in singer_infos:
            singer = singer_info.split("/")[-1]

            singers.append(singer)

            song_infos = glob(singer_info + "/*")
            for song_info in song_infos:
                song = song_info.split("/")[-1]

                songs.append(song)

                utts = glob(song_info + "/*.wav")

                for utt in utts:
                    uid = utt.split("/")[-1].split(".")[0]
                    global2singer2songs[global_split][singer][song].append(uid)

    unique_singers = list(set(singers))
    unique_songs = list(set(songs))
    unique_singers.sort()
    unique_songs.sort()

    print(
        "vocalist: {} singers, {} songs ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return global2singer2songs, unique_singers


def main(output_path, dataset_path):
    print("-" * 10)
    print("Preparing test samples for vocalist...\n")

    save_dir = os.path.join(output_path, "vocalist")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return
    utt2singer = open(utt2singer_file, "w")

    # Load
    vocalist_path = dataset_path

    global2singer2songs, unique_singers = vocalist_statistics(vocalist_path)

    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    for global_info, singer2songs in tqdm(global2singer2songs.items()):
        for singer, songs in tqdm(singer2songs.items()):
            song_names = list(songs.keys())

            for chosen_song in song_names:
                for chosen_uid in songs[chosen_song]:
                    res = {
                        # Dataset tag ("opensinger" in the original appears to be a copy-paste slip)
                        "Dataset": "vocalist",
                        "Singer": singer,
                        "Song": chosen_song,
                        "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                    }
                    res["Path"] = "{}/{}/{}/{}.wav".format(
                        global_info, singer, chosen_song, chosen_uid
                    )
                    res["Path"] = os.path.join(vocalist_path, res["Path"])
                    assert os.path.exists(res["Path"])

                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                    res["Duration"] = duration

                    # Every vocalist utterance is used as a test sample; train stays empty.
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1

                    utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
pretrained/bigvgan/args.json
ADDED
@@ -0,0 +1,235 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "exp_name": "bigvgan_large",
    "inference": {
        "batch_size": 1,
    },
    "model": {
        "bigvgan": {
            "activation": "snakebeta",
            "resblock": "1",
            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            "resblock_kernel_sizes": [3, 7, 11],
            "snake_logscale": true,
            "upsample_initial_channel": 1536,
            "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
            "upsample_rates": [4, 4, 2, 2, 2, 2],
        },
        "discriminators": ["mpd", "msstftd"],
        "generator": "bigvgan",
        "mpd": {
            "discriminator_channel_multi": 1,
            "mpd_reshapes": [2, 3, 5, 7, 11],
            "use_spectral_norm": false,
        },
        "mrd": {
            "discriminator_channel_multi": 1,
            "mrd_override": false,
            "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
            "use_spectral_norm": false,
        },
        "msstftd": {
            "filters": 32,
        },
    },
    "model_type": "GANVocoder",
    "preprocess": {
        "audio_dir": "audios",
        "bits": 8,
        "contentvec_dir": "contentvec",
        "cut_mel_frame": 32,
        "data_augment": false,
        "dur_dir": "durs",
        "duration_dir": "duration",
        "emo2id": "emo2id.json",
        "energy_dir": "energys",
        "energy_extract_mode": "from_mel",
        "energy_norm": false,
        "extract_audio": true,
        "extract_contentvec_feature": false,
        "extract_duration": false,
        "extract_energy": false,
        "extract_label": false,
        "extract_mcep": false,
        "extract_mel": true,
        "extract_mert_feature": false,
        "extract_one_hot": false,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_wenet_feature": false,
        "extract_whisper_feature": false,
        "f0_max": 1100,
        "f0_min": 50,
        "file_lst": "file.lst",
        "fmax": 12000,
        "fmin": 0,
        "hop_size": 256,
        "is_mu_law": false,
        "lab_dir": "labs",
        "label_dir": "labels",
        "mcep_dir": "mcep",
        "mel_dir": "mels",
        "mel_min_max_norm": false,
        "min_level_db": -115,
        "n_fft": 1024,
        "n_mel": 100,
        "num_silent_frames": 8,
        "phone_seq_file": "phone_seq_file",
        "pitch_bin": 256,
        "pitch_dir": "pitches",
        "pitch_extractor": "parselmouth",
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "pitch_norm": false,
        "processed_dir": "processed_data",
        "ref_level_db": 20,
        "sample_rate": 24000,
        "spk2id": "singers.json",
        "train_file": "train.json",
        "trim_fft_size": 512,
        "trim_hop_size": 128,
        "trim_silence": false,
        "trim_top_db": 30,
        "trimmed_wav_dir": "trimmed_wavs",
        "use_audio": true,
        "use_dur": false,
        "use_emoid": false,
        "use_frame_duration": false,
        "use_frame_energy": false,
        "use_frame_pitch": false,
        "use_lab": false,
        "use_label": false,
        "use_log_scale_energy": false,
        "use_log_scale_pitch": false,
        "use_mel": true,
        "use_one_hot": false,
        "use_phn_seq": false,
        "use_phone_duration": false,
        "use_phone_energy": false,
        "use_phone_pitch": false,
        "use_spkid": false,
        "use_uv": false,
        "use_wav": false,
        "use_wenet": false,
        "utt2emo": "utt2emo",
        "utt2spk": "utt2spk",
        "uv_dir": "uvs",
        "valid_file": "test.json",
        "wav_dir": "wavs",
        "wenet_dir": "wenet",
        "win_size": 1024,
    },
    "supported_model_type": ["GANVocoder", "Fastspeech2", "DiffSVC", "Transformer", "EDM", "CD"],
    "train": {
        "adamw": {
            "adam_b1": 0.8,
            "adam_b2": 0.99,
            "lr": 0.0002,
        },
        "batch_size": 4,
        "criterions": ["feature", "discriminator", "generator", "mel"],
        "dataloader": {
            "num_worker": 4,
            "pin_memory": true,
        },
        "ddp": true,
        "epochs": 50000,
        "exponential_lr": {
            "lr_decay": 0.999,
        },
        "gradient_accumulation_step": 1,
        "keep_checkpoint_max": 5,
        "max_epoch": 1000000,
        "max_steps": 1000000,
        "multi_speaker_training": false,
        "random_seed": 114514,
        "run_eval": [true],
        "sampler": {
            "drop_last": true,
            "holistic_shuffle": true,
        },
        "save_checkpoint_stride": [200],
        "save_checkpoints_steps": 10000,
        "save_summary_steps": 500,
        "total_training_steps": 50000,
        "tracker": ["tensorboard"],
        "valid_interval": 10000,
    },
}
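Amphion's own config loader is not part of this diff; as a minimal sketch, assuming a JSON5-tolerant parser (the dump keeps trailing commas, which the standard json module rejects), the file can be read like this:

# Illustrative only: json5 (pip install json5) is an assumption, not a documented dependency here.
import json5

with open("pretrained/bigvgan/args.json") as f:
    args = json5.load(f)

print(args["model"]["bigvgan"]["upsample_rates"])   # [4, 4, 2, 2, 2, 2]
print(args["preprocess"]["sample_rate"])            # 24000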
pretrained/contentvec/README.md
ADDED
@@ -0,0 +1,5 @@
# Download

- [Link](https://github.com/auspicious3000/contentvec)
- Model: `ContentVec_legacy`
- Classes: 500
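Presumably the downloaded `ContentVec_legacy` checkpoint is meant to be placed in this directory (`pretrained/contentvec/`) so that the ContentVec-based feature extraction referenced in the preprocessing configs can locate it; the exact expected filename is not stated in this diff.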