Billpai committed on
Commit 0312eff · 1 Parent(s): f196feb
optimizer/__init__.py ADDED
File without changes
optimizer/optimizers.py ADDED
@@ -0,0 +1,780 @@
+# This module is modified from https://github.com/Plachtaa/VALL-E-X/blob/3faaf8ccadb154d63b38070caf518ce9309ea0f4/modules/optim.py#L836
+
+import logging
+import contextlib
+import torch
+from torch import Tensor
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim import Optimizer
+from typing import List, Tuple
+from collections import defaultdict
+
+
+class NoamLR(_LRScheduler):
+    """
+    Implements the Noam learning rate schedule. This corresponds to increasing the learning rate
+    linearly for the first ``num_warmup`` training steps, and decreasing it thereafter proportionally
+    to the inverse square root of the step number, scaled by the inverse square root of the
+    dimensionality of the model. Time will tell if this is just madness or it's actually important.
+
+    Parameters
+    ----------
+    num_warmup: ``int``, required.
+        The number of steps to linearly increase the learning rate.
+    """
+
+    def __init__(self, optimizer, num_warmup):
+        self.num_warmup = num_warmup
+        self.base_lr = optimizer.param_groups[0]["lr"]
+        super().__init__(optimizer)
+
+    def get_lr(self):
+        last_epoch = max(1, self.last_epoch)
+        scale = min(last_epoch ** (-0.5), last_epoch * self.num_warmup ** (-1.5))
+        return [scale * self.base_lr]
+
+
+class Eve(Optimizer):
+    """
+    Implements the Eve algorithm. This is a modified version of AdamW with a special
+    way of setting the weight-decay / shrinkage-factor, which is designed to make the
+    rms of the parameters approach a particular target_rms (default: 0.1). This is
+    for use with networks with 'scaled' versions of modules (see scaling.py), which
+    will be close to invariant to the absolute scale on the parameter matrix.
+
+    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
+    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
+    Eve is unpublished so far.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.98))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay coefficient (default: 1e-3;
+            this value means that the weight would decay significantly after
+            about 1k minibatches). It is not multiplied by the learning rate, but
+            is conditional on the RMS-value of the parameter being > target_rms.
+        target_rms (float, optional): target root-mean-square value of
+            parameters; if they fall below this we will stop applying weight decay.
+
+
+    .. _Adam: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _Decoupled Weight Decay Regularization:
+        https://arxiv.org/abs/1711.05101
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        betas=(0.9, 0.98),
+        eps=1e-8,
+        weight_decay=1e-3,
+        target_rms=0.1,
+    ):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0 <= weight_decay <= 0.1:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        if not 0 < target_rms <= 10.0:
+            raise ValueError("Invalid target_rms value: {}".format(target_rms))
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            target_rms=target_rms,
+        )
+        super(Eve, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(Eve, self).__setstate__(state)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+
+                # Perform optimization step
+                grad = p.grad
+                if grad.is_sparse:
+                    raise RuntimeError("AdamW does not support sparse gradients")
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state["step"] = 0
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+
+                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+
+                beta1, beta2 = group["betas"]
+
+                state["step"] += 1
+                bias_correction1 = 1 - beta1 ** state["step"]
+                bias_correction2 = 1 - beta2 ** state["step"]
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
+                    group["eps"]
+                )
+
+                step_size = group["lr"] / bias_correction1
+                target_rms = group["target_rms"]
+                weight_decay = group["weight_decay"]
+
+                if p.numel() > 1:
+                    # avoid applying this weight-decay on "scaling factors"
+                    # (which are scalar).
+                    is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
+                    p.mul_(1 - (weight_decay * is_above_target_rms))
+
+                p.addcdiv_(exp_avg, denom, value=-step_size)
+
+                # if random.random() < 0.0005:
+                #     step = (exp_avg / denom) * step_size
+                #     logging.info(
+                #         f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
+                #     )
+
+        return loss
+
+
+class BatchedOptimizer(Optimizer):
+    """
+    This class adds to class Optimizer the capability to optimize parameters in batches:
+    it will stack the parameters and their grads for you so the optimizer can work
+    on tensors with an extra leading dimension. This is intended for speed with GPUs,
+    as it reduces the number of kernels launched in the optimizer.
+
+    Args:
+        params:
+    """
+
+    def __init__(self, params, defaults):
+        super(BatchedOptimizer, self).__init__(params, defaults)
+
+    @contextlib.contextmanager
+    def batched_params(self, param_group, group_params_names):
+        """
+        This function returns (technically, yields) a list of
+        tuples (p, state), where
+        p is a `fake` parameter that is stacked (over axis 0) from real parameters
+        that share the same shape, and its gradient is also stacked;
+        `state` is the state corresponding to this batch of parameters
+        (it will be physically located in the "state" for one of the real
+        parameters, the last one that has any particular shape and dtype).
+
+        This function is decorated as a context manager so that it can
+        write parameters back to their "real" locations.
+
+        The idea is, instead of doing:
+        <code>
+          for p in group["params"]:
+              state = self.state[p]
+              ...
+        </code>
+        you can do:
+        <code>
+          with self.batched_params(group["params"]) as batches:
+              for p, state, p_names in batches:
+                  ...
+        </code>
+
+        Args:
+            group: a parameter group, which is a list of parameters; should be
+                one of self.param_groups.
+            group_params_names: name for each parameter in group,
+                which is List[str].
+        """
+        batches = defaultdict(
+            list
+        )  # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
+        batches_names = defaultdict(
+            list
+        )  # `batches_names` maps from tuple (dtype_as_str,*shape) to list of str
+
+        assert len(param_group) == len(group_params_names)
+        for p, named_p in zip(param_group, group_params_names):
+            key = (str(p.dtype), *p.shape)
+            batches[key].append(p)
+            batches_names[key].append(named_p)
+
+        batches_names_keys = list(batches_names.keys())
+        sorted_idx = sorted(
+            range(len(batches_names)), key=lambda i: batches_names_keys[i]
+        )
+        batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
+        batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
+
+        stacked_params_dict = dict()
+
+        # turn batches into a list, in deterministic order.
+        # tuples will contain tuples of (stacked_param, state, stacked_params_names),
+        # one for each batch in `batches`.
+        tuples = []
+
+        for batch, batch_names in zip(batches, batches_names):
+            p = batch[0]
+            # we arbitrarily store the state in the
+            # state corresponding to the 1st parameter in the
+            # group. class Optimizer will take care of saving/loading state.
+            state = self.state[p]
+            p_stacked = torch.stack(batch)
+            grad = torch.stack(
+                [torch.zeros_like(p) if p.grad is None else p.grad for p in batch]
+            )
+            p_stacked.grad = grad
+            stacked_params_dict[key] = p_stacked
+            tuples.append((p_stacked, state, batch_names))
+
+        yield tuples
+
+        for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
+            for i, p in enumerate(batch):
+                p.copy_(stacked_params[i])
+
+
+class ScaledAdam(BatchedOptimizer):
+    """
+    Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
+    proportional to the norm of that parameter; and also learn the scale of the parameter,
+    in log space, subject to upper and lower limits (as if we had factored each parameter as
+    param = underlying_param * log_scale.exp())
+
+
+    Args:
+        params: The parameters or param_groups to optimize (like other Optimizer subclasses)
+        lr: The learning rate. We will typically use a learning rate schedule that starts
+            at 0.03 and decreases over time, i.e. much higher than other common
+            optimizers.
+        clipping_scale: (e.g. 2.0)
+            A scale for gradient-clipping: if specified, the normalized gradients
+            over the whole model will be clipped to have 2-norm equal to
+            `clipping_scale` times the median 2-norm over the most recent period
+            of `clipping_update_period` minibatches. By "normalized gradients",
+            we mean after multiplying by the rms parameter value for this tensor
+            [for non-scalars]; this is appropriate because our update is scaled
+            by this quantity.
+        betas: beta1, beta2 are momentum constants for regular momentum, and moving sum-sq grad.
+            Must satisfy 0 < beta1 <= beta2 < 1.
+        scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
+            scale of each parameter tensor and the scalar parameters of the model.
+            If each parameter were decomposed
+            as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
+            would be the scaling factor on the learning rate of p_scale.
+        eps: A general-purpose epsilon to prevent division by zero
+        param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
+            learning the scale on the parameters (we'll constrain the rms of each non-scalar
+            parameter tensor to be >= this value)
+        param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
+            learning the scale on the parameters (we'll constrain the rms of each non-scalar
+            parameter tensor to be <= this value)
+        scalar_max: Maximum absolute value for scalar parameters (applicable if your
+            model has any parameters with numel() == 1).
+        size_update_period: The periodicity, in steps, with which we update the size (scale)
+            of the parameter tensor. This is provided to save a little time
+            in the update.
+        clipping_update_period: if clipping_scale is specified, this is the period
+    """
+
+    def __init__(
+        self,
+        params,
+        lr=3e-02,
+        clipping_scale=None,
+        betas=(0.9, 0.98),
+        scalar_lr_scale=0.1,
+        eps=1.0e-08,
+        param_min_rms=1.0e-05,
+        param_max_rms=3.0,
+        scalar_max=10.0,
+        size_update_period=4,
+        clipping_update_period=100,
+        parameters_names=None,
+        show_dominant_parameters=True,
+    ):
+
+        assert parameters_names is not None, (
+            "Please prepare parameters_names, "
+            "which is a List[List[str]]. Each List[str] is for a group "
+            "and each str is for a parameter"
+        )
+        defaults = dict(
+            lr=lr,
+            clipping_scale=clipping_scale,
+            betas=betas,
+            scalar_lr_scale=scalar_lr_scale,
+            eps=eps,
+            param_min_rms=param_min_rms,
+            param_max_rms=param_max_rms,
+            scalar_max=scalar_max,
+            size_update_period=size_update_period,
+            clipping_update_period=clipping_update_period,
+        )
+
+        super(ScaledAdam, self).__init__(params, defaults)
+        assert len(self.param_groups) == len(parameters_names)
+        self.parameters_names = parameters_names
+        self.show_dominant_parameters = show_dominant_parameters
+
+    def __setstate__(self, state):
+        super(ScaledAdam, self).__setstate__(state)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        batch = True
+
+        for group, group_params_names in zip(self.param_groups, self.parameters_names):
+
+            with self.batched_params(group["params"], group_params_names) as batches:
+
+                # batches is list of pairs (stacked_param, state). stacked_param is like
+                # a regular parameter, and will have a .grad, but the 1st dim corresponds to
+                # a stacking dim, it is not a real dim.
+
+                if (
+                    len(batches[0][1]) == 0
+                ):
+                    clipping_scale = 1
+                else:
+                    clipping_scale = self._get_clipping_scale(group, batches)
+
+                for p, state, _ in batches:
+                    # Perform optimization step.
+                    # grad is not going to be None, we handled that when creating the batches.
+                    grad = p.grad
+                    if grad.is_sparse:
+                        raise RuntimeError(
+                            "ScaledAdam optimizer does not support sparse gradients"
+                        )
+                    # State initialization
+                    if len(state) == 0:
+                        self._init_state(group, p, state)
+
+                    self._step_one_batch(group, p, state, clipping_scale)
+
+        return loss
+
+    def _init_state(self, group: dict, p: Tensor, state: dict):
+        """
+        Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
+        is actually the batch dimension, corresponding to batched-together
+        parameters of a given shape.
+
+
+        Args:
+            group: Dict to look up configuration values.
+            p: The parameter that we are initializing the state for
+            state: Dict from string to whatever state we are initializing
+        """
+        size_update_period = group["size_update_period"]
+
+        state["step"] = 0
+
+        kwargs = {"device": p.device, "dtype": p.dtype}
+
+        # 'delta' implements conventional momentum. There are
+        # several different kinds of update going on, so rather than
+        # compute "exp_avg" like in Adam, we store and decay a
+        # parameter-change "delta", which combines all forms of
+        # update. This is equivalent to how it's done in Adam,
+        # except for the first few steps.
+        state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)
+
+        batch_size = p.shape[0]
+        numel = p.numel() // batch_size
+        numel = p.numel()
+
+        if numel > 1:
+            # "param_rms" just periodically records the scalar root-mean-square value of
+            # the parameter tensor.
+            # it has a shape like (batch_size, 1, 1, 1, 1)
+            param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
+            state["param_rms"] = param_rms
+
+            state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
+            state["scale_grads"] = torch.zeros(
+                size_update_period, *param_rms.shape, **kwargs
+            )
+
+        # exp_avg_sq is the weighted sum of scaled gradients, as in Adam.
+        state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)

+    def _get_clipping_scale(
+        self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
+    ) -> float:
+        """
+        Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
+        by this amount before applying the rest of the update.
+
+        Args:
+            group: the parameter group, an item in self.param_groups
+            tuples: a list of tuples of (param, state, param_names)
+                where param is a batched set of parameters,
+                with a .grad (1st dim is batch dim)
+                and state is the state-dict where optimization parameters are kept.
+                param_names is a List[str] where each str is the name of a parameter
+                in the batched set of parameters "param".
+        """
+        assert len(tuples) >= 1
+        clipping_scale = group["clipping_scale"]
+        (first_p, first_state, _) = tuples[0]
+        step = first_state["step"]
+        if clipping_scale is None or step == 0:
+            # no clipping. return early on step == 0 because the other
+            # parameters' state won't have been initialized yet.
+            return 1.0
+        clipping_update_period = group["clipping_update_period"]
+
+        tot_sumsq = torch.tensor(0.0, device=first_p.device)
+        for (p, state, param_names) in tuples:
+            grad = p.grad
+            if grad.is_sparse:
+                raise RuntimeError(
+                    "ScaledAdam optimizer does not support sparse gradients"
+                )
+            if p.numel() == p.shape[0]:  # a batch of scalars
+                tot_sumsq += (grad**2).sum()  # sum() to change shape [1] to []
+            else:
+                tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
+
+        tot_norm = tot_sumsq.sqrt()
+        if "model_norms" not in first_state:
+            first_state["model_norms"] = torch.zeros(
+                clipping_update_period, device=p.device
+            )
+        first_state["model_norms"][step % clipping_update_period] = tot_norm
+
+        if step % clipping_update_period == 0:
+            # Print some stats.
+            # We don't reach here if step == 0 because we would have returned
+            # above.
+            sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
+            quartiles = []
+            for n in range(0, 5):
+                index = min(
+                    clipping_update_period - 1,
+                    (clipping_update_period // 4) * n,
+                )
+                quartiles.append(sorted_norms[index].item())
+
+            median = quartiles[2]
+            threshold = clipping_scale * median
+            first_state["model_norm_threshold"] = threshold
+            percent_clipped = (
+                first_state["num_clipped"] * 100.0 / clipping_update_period
+                if "num_clipped" in first_state
+                else 0.0
+            )
+            first_state["num_clipped"] = 0
+            quartiles = " ".join(["%.3e" % x for x in quartiles])
+            logging.info(
+                f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
+                f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
+            )
+
+        if step < clipping_update_period:
+            return 1.0  # We have not yet estimated a norm to clip to.
+        else:
+            try:
+                model_norm_threshold = first_state["model_norm_threshold"]
+            except KeyError:
+                logging.info(
+                    "Warning: model_norm_threshold not in state: possibly "
+                    "you changed config when restarting, adding clipping_scale option?"
+                )
+                return 1.0
+            ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
+            if ans < 1.0:
+                first_state["num_clipped"] += 1
+            if ans < 0.1:
+                logging.warn(
+                    f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
+                )
+                if self.show_dominant_parameters:
+                    assert p.shape[0] == len(param_names)
+                    self._show_gradient_dominating_parameter(tuples, tot_sumsq)
+            return ans
+
+    def _show_gradient_dominating_parameter(
+        self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor
+    ):
+        """
+        Show information about the parameter which dominates tot_sumsq.
+
+        Args:
+            tuples: a list of tuples of (param, state, param_names)
+                where param is a batched set of parameters,
+                with a .grad (1st dim is batch dim)
+                and state is the state-dict where optimization parameters are kept.
+                param_names is a List[str] where each str is the name of a parameter
+                in the batched set of parameters "param".
+            tot_sumsq: sumsq of all parameters. Though it could be calculated
+                from tuples, we still pass it to save some time.
+        """
+        all_sumsq_orig = {}
+        for (p, state, batch_param_names) in tuples:
+            # p is a stacked batch of parameters.
+            batch_grad = p.grad
+            if p.numel() == p.shape[0]:  # a batch of scalars
+                batch_sumsq_orig = batch_grad**2
+                # Dummy values used by the following `zip` statement.
+                batch_rms_orig = torch.ones(p.shape[0])
+            else:
+                batch_rms_orig = state["param_rms"]
+                batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(
+                    dim=list(range(1, batch_grad.ndim))
+                )
+
+            for name, sumsq_orig, rms, grad in zip(
+                batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
+            ):
+
+                proportion_orig = sumsq_orig / tot_sumsq
+                all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
+
+        assert torch.isclose(
+            sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
+            torch.tensor(1.0),
+        )
+        sorted_by_proportion = {
+            k: v
+            for k, v in sorted(
+                all_sumsq_orig.items(),
+                key=lambda item: item[1][0],
+                reverse=True,
+            )
+        }
+        dominant_param_name = next(iter(sorted_by_proportion))
+        (
+            dominant_proportion,
+            dominant_sumsq,
+            dominant_rms,
+            dominant_grad,
+        ) = sorted_by_proportion[dominant_param_name]
+        logging.info(
+            f"Parameter dominating tot_sumsq {dominant_param_name}"
+            f" with proportion {dominant_proportion:.2f},"
+            f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
+            f"={dominant_sumsq:.3e},"
+            f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
+            f" orig_rms_sq={(dominant_rms**2).item():.3e}"
+        )
+
+    def _step_one_batch(
+        self, group: dict, p: Tensor, state: dict, clipping_scale: float
+    ):
+        """
+        Do the step for one parameter, which is actually going to be a batch of
+        `real` parameters, with dim 0 as the batch dim.
+        Args:
+            group: dict to look up configuration values
+            p: parameter to update (actually multiple parameters stacked together
+                as a batch)
+            state: state-dict for p, to look up the optimizer state
+        """
+        lr = group["lr"]
+        size_update_period = group["size_update_period"]
+        beta1 = group["betas"][0]
+
+        grad = p.grad
+        if clipping_scale != 1.0:
+            grad = grad * clipping_scale
+        step = state["step"]
+        delta = state["delta"]
+
+        delta.mul_(beta1)
+        batch_size = p.shape[0]
+        numel = p.numel() // batch_size
+        if numel > 1:
+            # Update the size/scale of p, and set param_rms
+            scale_grads = state["scale_grads"]
+            scale_grads[step % size_update_period] = (p * grad).sum(
+                dim=list(range(1, p.ndim)), keepdim=True
+            )
+            if step % size_update_period == size_update_period - 1:
+                param_rms = state["param_rms"]  # shape: (batch_size, 1, 1, ..)
+                param_rms.copy_(
+                    (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
+                )
+                if step > 0:
+                    # self._size_update() learns the overall scale on the
+                    # parameter, by shrinking or expanding it.
+                    self._size_update(group, scale_grads, p, state)
+
+        if numel == 1:
+            # For parameters with 1 element we just use regular Adam.
+            # Updates delta.
+            self._step_scalar(group, p, state)
+        else:
+            self._step(group, p, state)
+
+        state["step"] = step + 1
+
+    def _size_update(
+        self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
+    ) -> None:
+        """
+        Called only where p.numel() > 1, this updates the scale of the parameter.
+        If we imagine: p = underlying_param * scale.exp(), and we are doing
+        gradient descent on underlying param and on scale, this function does the update
+        on `scale`.
+
+        Args:
+            group: dict to look up configuration values
+            scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
+                grads w.r.t. the scales.
+            p: The parameter to update
+            state: The state-dict of p
+        """
+
+        param_rms = state["param_rms"]
+        beta1, beta2 = group["betas"]
+        size_lr = group["lr"] * group["scalar_lr_scale"]
+        param_min_rms = group["param_min_rms"]
+        param_max_rms = group["param_max_rms"]
+        eps = group["eps"]
+        step = state["step"]
+        batch_size = p.shape[0]
+
+        size_update_period = scale_grads.shape[0]
+        # correct beta2 for the size update period: we will have
+        # faster decay at this level.
+        beta2_corr = beta2**size_update_period
+
+        scale_exp_avg_sq = state["scale_exp_avg_sq"]  # shape: (batch_size, 1, 1, ..)
+        scale_exp_avg_sq.mul_(beta2_corr).add_(
+            (scale_grads**2).mean(dim=0),  # mean over dim `size_update_period`
+            alpha=1 - beta2_corr,
+        )  # shape is (batch_size, 1, 1, ...)
+
+        # The 1st time we reach here is when size_step == 1.
+        size_step = (step + 1) // size_update_period
+        bias_correction2 = 1 - beta2_corr**size_step
+        # we don't bother with bias_correction1; this will help prevent divergence
+        # at the start of training.
+
+        denom = scale_exp_avg_sq.sqrt() + eps
+
+        scale_step = (
+            -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
+        )
+
+        is_too_small = param_rms < param_min_rms
+        is_too_large = param_rms > param_max_rms
+
+        # when the param gets too small, just don't shrink it any further.
+        scale_step.masked_fill_(is_too_small, 0.0)
+        # when it gets too large, stop it from getting any larger.
+        scale_step.masked_fill_(is_too_large, -size_lr * size_update_period)
+        delta = state["delta"]
+        # the factor of (1-beta1) relates to momentum.
+        delta.add_(p * scale_step, alpha=(1 - beta1))
+
+    def _step(self, group: dict, p: Tensor, state: dict):
+        """
+        This function does the core update of self.step(), in the case where the members of
+        the batch have more than 1 element.
+
+        Args:
+            group: A dict which will be used to look up configuration values
+            p: The parameter to be updated
+            grad: The grad of p
+            state: The state-dict corresponding to parameter p
+
+        This function modifies p.
+        """
+        grad = p.grad
+        lr = group["lr"]
+        beta1, beta2 = group["betas"]
+        eps = group["eps"]
+        param_min_rms = group["param_min_rms"]
+        step = state["step"]
+
+        exp_avg_sq = state["exp_avg_sq"]
+        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
+
+        this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
+        bias_correction2 = 1 - beta2 ** (this_step + 1)
+        if bias_correction2 < 0.99:
+            # note: not in-place.
+            exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
+
+        denom = exp_avg_sq.sqrt()
+        denom += eps
+        grad = grad / denom
+
+        alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
+
+        delta = state["delta"]
+        delta.add_(grad * alpha)
+        p.add_(delta)
+
+    def _step_scalar(self, group: dict, p: Tensor, state: dict):
+        """
+        A simplified form of the core update for scalar tensors, where we cannot get a good
+        estimate of the parameter rms.
+        """
+        beta1, beta2 = group["betas"]
+        scalar_max = group["scalar_max"]
+        eps = group["eps"]
+        lr = group["lr"] * group["scalar_lr_scale"]
+        grad = p.grad
+
+        exp_avg_sq = state["exp_avg_sq"]  # shape: (batch_size,)
+        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+
+        # bias_correction2 is like in Adam. Don't bother with bias_correction1;
+        # slower update at the start will help stability anyway.
+        bias_correction2 = 1 - beta2 ** (state["step"] + 1)
+        denom = (exp_avg_sq / bias_correction2).sqrt() + eps
+
+        delta = state["delta"]
+        delta.add_(grad / denom, alpha=-lr * (1 - beta1))
+        p.clamp_(min=-scalar_max, max=scalar_max)
+        p.add_(delta)
+
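Below is a minimal usage sketch for the two classes added in this file. Only the ScaledAdam and NoamLR signatures come from the code above; the toy nn.Linear model and the way parameters_names is collected from named_parameters() are illustrative assumptions, not part of the commit.

import torch
from torch import nn

from optimizer.optimizers import NoamLR, ScaledAdam

model = nn.Linear(16, 4)

# ScaledAdam requires a List[List[str]] of parameter names, one inner list
# per parameter group; here there is a single group with all parameters.
param_names = [[name for name, _ in model.named_parameters()]]
optimizer = ScaledAdam(
    model.parameters(),
    lr=0.03,
    clipping_scale=2.0,
    parameters_names=param_names,
)
scheduler = NoamLR(optimizer, num_warmup=100)

x, y = torch.randn(8, 16), torch.randn(8, 4)
for _ in range(3):
    loss = nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()   # stacks parameters by (dtype, shape) internally
    scheduler.step()   # linear warmup, then inverse-square-root decay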
preprocessors/__init__.py ADDED
@@ -0,0 +1,189 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+For source datasets' standard samples
+"""
+
+from collections import defaultdict
+import os
+import json
+
+SPEECH_DATASETS = ["vctk", "vctksample"]
+
+GOLDEN_TEST_SAMPLES = defaultdict(list)
+GOLDEN_TEST_SAMPLES["m4singer"] = [
+    "Alto-1_美错_0014",
+    "Bass-1_十年_0008",
+    "Soprano-2_同桌的你_0018",
+    "Tenor-5_爱笑的眼睛_0010",
+]
+GOLDEN_TEST_SAMPLES["svcc"] = [
+    # IDF1
+    "IDF1_10030",
+    "IDF1_10120",
+    "IDF1_10140",
+    # IDM1
+    "IDM1_10001",
+    "IDM1_10030",
+    "IDM1_10120",
+    # CDF1
+    "CDF1_10030",
+    "CDF1_10120",
+    "CDF1_10140",
+    # CDM1
+    "CDM1_10001",
+    "CDM1_10030",
+    "CDM1_10120",
+]
+GOLDEN_TEST_SAMPLES["svcceval"] = [
+    # SF1
+    "SF1_30001",
+    "SF1_30002",
+    "SF1_30003",
+    # SM1
+    "SM1_30001",
+    "SM1_30002",
+    "SM1_30003",
+]
+GOLDEN_TEST_SAMPLES["popbutfy"] = [
+    "Female1#you_are_my_sunshine_Professional#0",
+    "Female4#Someone_Like_You_Professional#10",
+    "Male2#Lemon_Tree_Professional#12",
+    "Male5#can_you_feel_the_love_tonight_Professional#20",
+]
+GOLDEN_TEST_SAMPLES["opensinger"] = [
+    "Man_0_大鱼_10",
+    "Man_21_丑八怪_14",
+    "Woman_39_mojito_22",
+    "Woman_40_易燃易爆炸_12",
+]
+GOLDEN_TEST_SAMPLES["nus48e"] = [
+    "ADIZ_read#01#0000",
+    "MCUR_sing#10#0000",
+    "JLEE_read#08#0001",
+    "SAMF_sing#18#0001",
+]
+GOLDEN_TEST_SAMPLES["popcs"] = [
+    "明天会更好_0004",
+    "欧若拉_0005",
+    "虫儿飞_0006",
+    "隐形的翅膀_0008",
+]
+GOLDEN_TEST_SAMPLES["kising"] = [
+    "421_0040",
+    "424_0013",
+    "431_0026",
+]
+GOLDEN_TEST_SAMPLES["csd"] = [
+    "en_004a_0001",
+    "en_042b_0006",
+    "kr_013a_0006",
+    "kr_045b_0004",
+]
+GOLDEN_TEST_SAMPLES["opera"] = [
+    "fem_01#neg_1#0000",
+    "fem_12#pos_3#0003",
+    "male_02#neg_1#0002",
+    "male_11#pos_2#0001",
+]
+GOLDEN_TEST_SAMPLES["lijian"] = [
+    "058矜持_0000",
+    "079绒花_0000",
+    "120遥远的天空底下_0000",
+]
+GOLDEN_TEST_SAMPLES["cdmusiceval"] = ["陶喆_普通朋友", "蔡琴_给电影人的情书"]
+
+GOLDEN_TRAIN_SAMPLES = defaultdict(list)
+
+
+def get_golden_samples_indexes(
+    dataset_name,
+    dataset_dir=None,
+    cfg=None,
+    split=None,
+    min_samples=5,
+):
+    """
+    Get standard samples' indexes
+    """
+    if dataset_dir is None:
+        assert cfg is not None
+        dataset_dir = os.path.join(
+            cfg.OUTPUT_PATH,
+            "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
+            dataset_name,
+        )
+
+    assert split is not None
+    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
+    with open(utt_file, "r", encoding="utf-8") as f:
+        samples = json.load(f)
+
+    if "train" in split:
+        golden_samples = GOLDEN_TRAIN_SAMPLES[dataset_name]
+    if "test" in split:
+        golden_samples = GOLDEN_TEST_SAMPLES[dataset_name]
+
+    res = []
+    for idx, utt in enumerate(samples):
+        if utt["Uid"] in golden_samples:
+            res.append(idx)
+
+        if dataset_name == "cdmusiceval":
+            if "_".join(utt["Uid"].split("_")[:2]) in golden_samples:
+                res.append(idx)
+
+    if len(res) == 0:
+        res = [i for i in range(min_samples)]
+
+    return res
+
+
+def get_specific_singer_indexes(dataset_dir, singer_name, split):
+    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
+    with open(utt_file, "r", encoding="utf-8") as f:
+        samples = json.load(f)
+
+    res = []
+    for idx, utt in enumerate(samples):
+        if utt["Singer"] == singer_name:
+            res.append(idx)
+
+    assert len(res) != 0
+    return res
+
+
+def get_uids_and_wav_paths(
+    cfg, dataset, dataset_type="train", only_specific_singer=None, return_singers=False
+):
+    dataset_dir = os.path.join(
+        cfg.OUTPUT_PATH, "preprocess/{}_version".format(cfg.PREPROCESS_VERSION), dataset
+    )
+    dataset_file = os.path.join(
+        dataset_dir, "{}.json".format(dataset_type.split("_")[-1])
+    )
+    with open(dataset_file, "r") as f:
+        utterances = json.load(f)
+
+    indexes = range(len(utterances))
+    if "golden" in dataset_type:
+        # golden_train or golden_test
+        indexes = get_golden_samples_indexes(
+            dataset, dataset_dir, split=dataset_type.split("_")[-1]
+        )
+    if only_specific_singer is not None:
+        indexes = get_specific_singer_indexes(
+            dataset_dir, only_specific_singer, dataset_type
+        )
+
+    uids = [utterances[i]["Uid"] for i in indexes]
+    wav_paths = [utterances[i]["Path"] for i in indexes]
+    singers = [utterances[i]["Singer"] for i in indexes]
+
+    if not return_singers:
+        return uids, wav_paths
+    else:
+        return uids, wav_paths, singers
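The golden-sample lookup above can be exercised against a tiny hand-made metadata file, as in the hedged sketch below. The {split}.json layout, the "Uid" field, and the "m4singer" golden list come from this module; the temporary directory and the two fake samples are invented for illustration.

import json
import os
import tempfile

from preprocessors import get_golden_samples_indexes

with tempfile.TemporaryDirectory() as dataset_dir:
    # Fake preprocessed metadata: a list of utterance dicts with "Uid" keys.
    samples = [{"Uid": "Alto-1_美错_0001"}, {"Uid": "Alto-1_美错_0014"}]
    with open(os.path.join(dataset_dir, "test.json"), "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False)

    idx = get_golden_samples_indexes("m4singer", dataset_dir=dataset_dir, split="test")
    print(idx)  # [1] -- only the second Uid is in GOLDEN_TEST_SAMPLES["m4singer"]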
preprocessors/bigdata.py ADDED
@@ -0,0 +1,145 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import json
+from collections import defaultdict
+from tqdm import tqdm
+
+
+def get_uids_and_wav_paths(cfg, dataset, dataset_type):
+    assert dataset == "bigdata"
+    dataset_dir = os.path.join(
+        cfg.OUTPUT_PATH,
+        "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
+        "bigdata/{}".format(cfg.BIGDATA_VERSION),
+    )
+    dataset_file = os.path.join(
+        dataset_dir, "{}.json".format(dataset_type.split("_")[-1])
+    )
+    with open(dataset_file, "r") as f:
+        utterances = json.load(f)
+
+    # Uids
+    uids = [u["Uid"] for u in utterances]
+
+    # Wav paths
+    wav_paths = [u["Path"] for u in utterances]
+
+    return uids, wav_paths
+
+
+def take_duration(utt):
+    return utt["Duration"]
+
+
+def main(output_path, cfg):
+    datasets = cfg.dataset
+
+    print("-" * 10)
+    print("Preparing samples for bigdata...")
+    print("Including: \n{}\n".format("\n".join(datasets)))
+
+    datasets.sort()
+    bigdata_version = "_".join(datasets)
+
+    save_dir = os.path.join(output_path, bigdata_version)
+    os.makedirs(save_dir, exist_ok=True)
+
+    train_output_file = os.path.join(save_dir, "train.json")
+    test_output_file = os.path.join(save_dir, "test.json")
+    singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
+    utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)
+    utt2singer = open(utt2singer_file, "a+")
+    # We select songs of standard samples as test songs
+    train = []
+    test = []
+
+    train_total_duration = 0
+    test_total_duration = 0
+
+    # Singer unique names
+    singer_names = set()
+
+    for dataset in datasets:
+        dataset_path = os.path.join(output_path, dataset)
+        train_json = os.path.join(dataset_path, "train.json")
+        test_json = os.path.join(dataset_path, "test.json")
+
+        with open(train_json, "r", encoding="utf-8") as f:
+            train_utterances = json.load(f)
+
+        with open(test_json, "r", encoding="utf-8") as f:
+            test_utterances = json.load(f)
+
+        for utt in tqdm(train_utterances):
+            train.append(utt)
+            train_total_duration += utt["Duration"]
+            singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"]))
+            utt2singer.write(
+                "{}_{}\t{}_{}\n".format(
+                    utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"]
+                )
+            )
+
+        for utt in test_utterances:
+            test.append(utt)
+            test_total_duration += utt["Duration"]
+            singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"]))
+            utt2singer.write(
+                "{}_{}\t{}_{}\n".format(
+                    utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"]
+                )
+            )
+
+    utt2singer.close()
+
+    train.sort(key=take_duration)
+    test.sort(key=take_duration)
+    print("#Train = {}, #Test = {}".format(len(train), len(test)))
+    print(
+        "#Train hours= {}, #Test hours= {}".format(
+            train_total_duration / 3600, test_total_duration / 3600
+        )
+    )
+
+    # Singer Look Up Table
+    singer_names = list(singer_names)
+    singer_names.sort()
+    singer_lut = {name: i for i, name in enumerate(singer_names)}
+    print("#Singers: {}\n".format(len(singer_lut)))
+
+    # Save
+    with open(train_output_file, "w") as f:
+        json.dump(train, f, indent=4, ensure_ascii=False)
+    with open(test_output_file, "w") as f:
+        json.dump(test, f, indent=4, ensure_ascii=False)
+    with open(singer_dict_file, "w") as f:
+        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
+
+    # Save meta info
+    meta_info = {
+        "datasets": datasets,
+        "train": {"size": len(train), "hours": round(train_total_duration / 3600, 4)},
+        "test": {"size": len(test), "hours": round(test_total_duration / 3600, 4)},
+        "singers": {"size": len(singer_lut)},
+    }
+    singer2mins = defaultdict(float)
+    for utt in train:
+        dataset, singer, duration = utt["Dataset"], utt["Singer"], utt["Duration"]
+        singer2mins["{}_{}".format(dataset, singer)] += duration / 60
+    singer2mins = sorted(singer2mins.items(), key=lambda x: x[1], reverse=True)
+    singer2mins = dict(
+        zip([i[0] for i in singer2mins], [round(i[1], 2) for i in singer2mins])
+    )
+    meta_info["singers"]["training_minutes"] = singer2mins
+
+    with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
+        json.dump(meta_info, f, indent=4, ensure_ascii=False)
+
+    for singer, min in singer2mins.items():
+        print("Singer {}: {} mins".format(singer, min))
+    print("-" * 10, "\n")
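For orientation, main() above ends up writing a meta_info.json of roughly the shape sketched below; the keys mirror the meta_info dict built in the code, while every value here is invented.

# Hypothetical contents of bigdata/<BIGDATA_VERSION>/meta_info.json.
meta_info_example = {
    "datasets": ["m4singer", "opensinger"],
    "train": {"size": 12345, "hours": 23.4567},
    "test": {"size": 678, "hours": 1.2345},
    "singers": {
        "size": 42,
        "training_minutes": {"m4singer_Alto-1": 35.21, "opensinger_Man_0": 28.7},
    },
}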
preprocessors/cdmusiceval.py ADDED
@@ -0,0 +1,174 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from glob import glob
+import os
+import json
+import torchaudio
+from tqdm import tqdm
+from collections import defaultdict
+
+from utils.util import has_existed, remove_and_create
+from utils.audio_slicer import split_utterances_from_audio
+
+
+def split_to_utterances(input_dir, output_dir):
+    print("Splitting to utterances for {}...".format(input_dir))
+
+    files_list = glob("*", root_dir=input_dir)
+    files_list.sort()
+    for wav_file in tqdm(files_list):
+        # # Load waveform
+        # waveform, fs = torchaudio.load(os.path.join(input_dir, wav_file))
+
+        # Singer name, Song name
+        song_name, singer_name = wav_file.split("_")[2].split("-")
+        save_dir = os.path.join(output_dir, singer_name, song_name)
+
+        split_utterances_from_audio(
+            os.path.join(input_dir, wav_file), save_dir, max_duration_of_utterance=10
+        )
+
+        # # Split
+        # slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000, min_interval=1000)
+        # chunks = slicer.slice(waveform)
+
+        # for i, chunk in enumerate(chunks):
+        #     save_dir = os.path.join(output_dir, singer_name, song_name)
+        #     os.makedirs(save_dir, exist_ok=True)
+
+        #     output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
+        #     save_audio(output_file, chunk, fs)
+
+
+def _main(dataset_path):
+    """
+    Split to utterances
+    """
+    utterance_dir = os.path.join(dataset_path, "utterances")
+    remove_and_create(utterance_dir)
+    split_to_utterances(os.path.join(dataset_path, "vocal"), utterance_dir)
+
+
+def statistics(utterance_dir):
+    singers = []
+    songs = []
+    singers2songs = defaultdict(lambda: defaultdict(list))
+
+    singer_infos = glob(utterance_dir + "/*")
+
+    for singer_info in singer_infos:
+        singer = singer_info.split("/")[-1]
+
+        song_infos = glob(singer_info + "/*")
+
+        for song_info in song_infos:
+            song = song_info.split("/")[-1]
+
+            singers.append(singer)
+            songs.append(song)
+
+            utts = glob(song_info + "/*.wav")
+
+            for utt in utts:
+                uid = utt.split("/")[-1].split(".")[0]
+                singers2songs[singer][song].append(uid)
+
+    unique_singers = list(set(singers))
+    unique_songs = list(set(songs))
+    unique_singers.sort()
+    unique_songs.sort()
+
+    print(
+        "Statistics: {} singers, {} utterances ({} unique songs)".format(
+            len(unique_singers), len(songs), len(unique_songs)
+        )
+    )
+    print("Singers: \n{}".format("\t".join(unique_singers)))
+    return singers2songs, unique_singers
+
+
+def main(output_path, dataset_path):
+    print("-" * 10)
+    print("Preparing samples for CD Music Eval...\n")
+
+    if not os.path.exists(os.path.join(dataset_path, "utterances")):
+        print("Splitting into utterances...\n")
+        _main(dataset_path)
+
+    save_dir = os.path.join(output_path, "cdmusiceval")
+    os.makedirs(save_dir, exist_ok=True)
+    train_output_file = os.path.join(save_dir, "train.json")
+    test_output_file = os.path.join(save_dir, "test.json")
+    singer_dict_file = os.path.join(save_dir, "singers.json")
+    utt2singer_file = os.path.join(save_dir, "utt2singer")
+    if (
+        has_existed(train_output_file)
+        and has_existed(test_output_file)
+        and has_existed(singer_dict_file)
+        and has_existed(utt2singer_file)
+    ):
+        return
+    utt2singer = open(utt2singer_file, "w")
+
+    # Load
+    utt_path = os.path.join(dataset_path, "utterances")
+    singers2songs, unique_singers = statistics(utt_path)
+
+    # We select songs of standard samples as test songs
+    train = []
+    test = []
+
+    train_index_count = 0
+    test_index_count = 0
+
+    train_total_duration = 0
+    test_total_duration = 0
+
+    for singer, songs in tqdm(singers2songs.items()):
+        song_names = list(songs.keys())
+
+        for chosen_song in song_names:
+            for chosen_uid in songs[chosen_song]:
+                res = {
+                    "Dataset": "cdmusiceval",
+                    "Singer": singer,
+                    "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
+                }
+                res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
+                res["Path"] = os.path.join(utt_path, res["Path"])
+                assert os.path.exists(res["Path"])
+
+                waveform, sample_rate = torchaudio.load(res["Path"])
+                duration = waveform.size(-1) / sample_rate
+                res["Duration"] = duration
+
+                if duration <= 1e-8:
+                    continue
+
+                res["index"] = test_index_count
+                test_total_duration += duration
+                test.append(res)
+                test_index_count += 1
+
+                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
+
+    print("#Train = {}, #Test = {}".format(len(train), len(test)))
+    print(
+        "#Train hours= {}, #Test hours= {}".format(
+            train_total_duration / 3600, test_total_duration / 3600
+        )
+    )
+
+    # Save train.json and test.json
+    with open(train_output_file, "w") as f:
+        json.dump(train, f, indent=4, ensure_ascii=False)
+    with open(test_output_file, "w") as f:
+        json.dump(test, f, indent=4, ensure_ascii=False)
+
+    # Save singers.json
+    singer_lut = {name: i for i, name in enumerate(unique_singers)}
+    with open(singer_dict_file, "w") as f:
+        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/coco.py ADDED
@@ -0,0 +1,100 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import json
+import torchaudio
+from tqdm import tqdm
+from glob import glob
+from collections import defaultdict
+
+from utils.util import has_existed
+from preprocessors import GOLDEN_TEST_SAMPLES
+
+
+def get_test_songs():
+    return ["007Di Da Di"]
+
+
+def coco_statistics(data_dir):
+    song2utts = defaultdict(list)
+
+    song_infos = glob(data_dir + "/*")
+
+    for song in song_infos:
+        song_name = song.split("/")[-1]
+        utts = glob(song + "/*.wav")
+        for utt in utts:
+            uid = utt.split("/")[-1].split(".")[0]
+            song2utts[song_name].append(uid)
+
+    print("Coco: {} songs".format(len(song_infos)))
+    return song2utts
+
+
+def main(output_path, dataset_path):
+    print("-" * 10)
+    print("Preparing datasets for Coco...\n")
+
+    save_dir = os.path.join(output_path, "coco")
+    train_output_file = os.path.join(save_dir, "train.json")
+    test_output_file = os.path.join(save_dir, "test.json")
+    if has_existed(test_output_file):
+        return
+
+    # Load
+    song2utts = coco_statistics(dataset_path)
+    test_songs = get_test_songs()
+
+    # We select songs of standard samples as test songs
+    train = []
+    test = []
+
+    train_index_count = 0
+    test_index_count = 0
+
+    train_total_duration = 0
+    test_total_duration = 0
+
+    for song_name, uids in tqdm(song2utts.items()):
+        for chosen_uid in uids:
+            res = {
+                "Dataset": "coco",
+                "Singer": "coco",
+                "Song": song_name,
+                "Uid": "{}_{}".format(song_name, chosen_uid),
+            }
+            res["Path"] = "{}/{}.wav".format(song_name, chosen_uid)
+            res["Path"] = os.path.join(dataset_path, res["Path"])
+            assert os.path.exists(res["Path"])
+
+            waveform, sample_rate = torchaudio.load(res["Path"])
+            duration = waveform.size(-1) / sample_rate
+            res["Duration"] = duration
+
+            if song_name in test_songs:
+                res["index"] = test_index_count
+                test_total_duration += duration
+                test.append(res)
+                test_index_count += 1
+            else:
+                res["index"] = train_index_count
+                train_total_duration += duration
+                train.append(res)
+                train_index_count += 1
+
+    print("#Train = {}, #Test = {}".format(len(train), len(test)))
+    print(
+        "#Train hours= {}, #Test hours= {}".format(
+            train_total_duration / 3600, test_total_duration / 3600
+        )
+    )
+
+    # Save
+    os.makedirs(save_dir, exist_ok=True)
+    with open(train_output_file, "w") as f:
+        json.dump(train, f, indent=4, ensure_ascii=False)
+    with open(test_output_file, "w") as f:
+        json.dump(test, f, indent=4, ensure_ascii=False)
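Each utterance entry that this preprocessor (and its siblings) appends to train.json / test.json has the shape below. The keys come from the res dict built above and "007Di Da Di" is the test song named in get_test_songs(); the path prefix and the numeric values are invented.

example_entry = {
    "Dataset": "coco",
    "Singer": "coco",
    "Song": "007Di Da Di",
    "Uid": "007Di Da Di_0003",
    "Path": "/data/coco/007Di Da Di/0003.wav",  # hypothetical dataset root
    "Duration": 4.21,
    "index": 3,
}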
preprocessors/cocoeval.py ADDED
@@ -0,0 +1,99 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+import os
+import json
+import torchaudio
+from tqdm import tqdm
+from glob import glob
+from collections import defaultdict
+
+from utils.util import has_existed
+from utils.audio_slicer import split_utterances_from_audio
+from preprocessors import GOLDEN_TEST_SAMPLES
+
+
+def _split_utts():
+    raw_dir = "/mnt/chongqinggeminiceph1fs/geminicephfs/wx-mm-spr-xxxx/xueyaozhang/dataset/李玟/cocoeval/raw"
+    output_root = "/mnt/chongqinggeminiceph1fs/geminicephfs/wx-mm-spr-xxxx/xueyaozhang/dataset/李玟/cocoeval/utterances"
+
+    if os.path.exists(output_root):
+        os.system("rm -rf {}".format(output_root))
+
+    vocal_files = glob(os.path.join(raw_dir, "*/vocal.wav"))
+    for vocal_f in tqdm(vocal_files):
+        song_name = vocal_f.split("/")[-2]
+
+        output_dir = os.path.join(output_root, song_name)
+        os.makedirs(output_dir, exist_ok=True)
+
+        split_utterances_from_audio(vocal_f, output_dir, min_interval=300)
+
+
+def cocoeval_statistics(data_dir):
+    song2utts = defaultdict(list)
+
+    song_infos = glob(data_dir + "/*")
+
+    for song in song_infos:
+        song_name = song.split("/")[-1]
+        utts = glob(song + "/*.wav")
+        for utt in utts:
+            uid = utt.split("/")[-1].split(".")[0]
+            song2utts[song_name].append(uid)
+
+    print("Cocoeval: {} songs".format(len(song_infos)))
+    return song2utts
+
+
+def main(output_path, dataset_path):
+    print("-" * 10)
+    print("Preparing datasets for Cocoeval...\n")
+
+    save_dir = os.path.join(output_path, "cocoeval")
+    test_output_file = os.path.join(save_dir, "test.json")
+    if has_existed(test_output_file):
+        return
+
+    # Load
+    song2utts = cocoeval_statistics(dataset_path)
+
+    train, test = [], []
+    train_index_count, test_index_count = 0, 0
+    train_total_duration, test_total_duration = 0.0, 0.0
+
+    for song_name, uids in tqdm(song2utts.items()):
+        for chosen_uid in uids:
+            res = {
+                "Dataset": "cocoeval",
+                "Singer": "TBD",
+                "Song": song_name,
+                "Uid": "{}_{}".format(song_name, chosen_uid),
+            }
+            res["Path"] = "{}/{}.wav".format(song_name, chosen_uid)
+            res["Path"] = os.path.join(dataset_path, res["Path"])
+            assert os.path.exists(res["Path"])
+
+            waveform, sample_rate = torchaudio.load(res["Path"])
+            duration = waveform.size(-1) / sample_rate
+            res["Duration"] = duration
+
+            res["index"] = test_index_count
+            test_total_duration += duration
+            test.append(res)
+            test_index_count += 1
+
+    print("#Train = {}, #Test = {}".format(len(train), len(test)))
+    print(
+        "#Train hours= {}, #Test hours= {}".format(
+            train_total_duration / 3600, test_total_duration / 3600
+        )
+    )
+
+    # Save
+    os.makedirs(save_dir, exist_ok=True)
+    with open(test_output_file, "w") as f:
+        json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/csd.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import os
9
+ import glob
10
+ from tqdm import tqdm
11
+ import torchaudio
12
+ import pandas as pd
13
+ from glob import glob
14
+ from collections import defaultdict
15
+
16
+ from utils.io import save_audio
17
+ from utils.util import has_existed
18
+ from preprocessors import GOLDEN_TEST_SAMPLES
19
+
20
+
21
+ def save_utterance(output_file, waveform, fs, start, end, overlap=0.1):
22
+ """
23
+ waveform: [#channel, audio_len]
24
+ start, end, overlap: seconds
25
+ """
26
+ start = int((start - overlap) * fs)
27
+ end = int((end + overlap) * fs)
28
+ utterance = waveform[:, start:end]
29
+ save_audio(output_file, utterance, fs)
30
+
31
+
32
+ def split_to_utterances(language_dir, output_dir):
33
+ print("Splitting to utterances for {}...".format(language_dir))
34
+ wav_dir = os.path.join(language_dir, "wav")
35
+ phoneme_dir = os.path.join(language_dir, "txt")
36
+ annot_dir = os.path.join(language_dir, "csv")
37
+
38
+ pitches = set()
39
+ for wav_file in tqdm(glob("{}/*.wav".format(wav_dir))):
40
+ # Load waveform
41
+ song_name = wav_file.split("/")[-1].split(".")[0]
42
+ waveform, fs = torchaudio.load(wav_file)
43
+
44
+ # Load utterances
45
+ phoneme_file = os.path.join(phoneme_dir, "{}.txt".format(song_name))
46
+ with open(phoneme_file, "r") as f:
47
+ lines = f.readlines()
48
+ utterances = [l.strip().split() for l in lines]
49
+ utterances = [utt for utt in utterances if len(utt) > 0]
50
+
51
+ # Load annotation
52
+ annot_file = os.path.join(annot_dir, "{}.csv".format(song_name))
53
+ annot_df = pd.read_csv(annot_file)
54
+ pitches = pitches.union(set(annot_df["pitch"]))
55
+ starts = annot_df["start"].tolist()
56
+ ends = annot_df["end"].tolist()
57
+ syllables = annot_df["syllable"].tolist()
58
+
59
+ # Split
60
+ curr = 0
61
+ for i, phones in enumerate(utterances):
62
+ sz = len(phones)
63
+ assert phones[0] == syllables[curr]
64
+ assert phones[-1] == syllables[curr + sz - 1]
65
+
66
+ s = starts[curr]
67
+ e = ends[curr + sz - 1]
68
+ curr += sz
69
+
70
+ save_dir = os.path.join(output_dir, song_name)
71
+ os.makedirs(save_dir, exist_ok=True)
72
+
73
+ output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
74
+ save_utterance(output_file, waveform, fs, start=s, end=e)
75
+
76
+
77
+ def _main(dataset_path):
78
+ """
79
+ Split to utterances
80
+ """
81
+ utterance_dir = os.path.join(dataset_path, "utterances")
82
+
83
+ for lang in ["english", "korean"]:
84
+ split_to_utterances(os.path.join(dataset_path, lang), utterance_dir)
85
+
86
+
87
+ def get_test_songs():
88
+ golden_samples = GOLDEN_TEST_SAMPLES["csd"]
89
+ # every item is a tuple (language, song)
90
+ golden_songs = [s.split("_")[:2] for s in golden_samples]
91
+ # language_song, eg: en_001a
92
+ return golden_songs
93
+
94
+
95
+ def csd_statistics(data_dir):
96
+ languages = []
97
+ songs = []
98
+ languages2songs = defaultdict(lambda: defaultdict(list))
99
+
100
+ folder_infos = glob(data_dir + "/*")
101
+
102
+ for folder_info in folder_infos:
103
+ folder_info_split = folder_info.split("/")[-1]
104
+
105
+ language = folder_info_split[:2]
106
+ song = folder_info_split[2:]
107
+
108
+ languages.append(language)
109
+ songs.append(song)
110
+
111
+ utts = glob(folder_info + "/*")
112
+
113
+ for utt in utts:
114
+ uid = utt.split("/")[-1].split(".")[0]
115
+ languages2songs[language][song].append(uid)
116
+
117
+ unique_languages = list(set(languages))
118
+ unique_songs = list(set(songs))
119
+ unique_languages.sort()
120
+ unique_songs.sort()
121
+
122
+ print(
123
+ "csd: {} languages, {} utterances ({} unique songs)".format(
124
+ len(unique_languages), len(songs), len(unique_songs)
125
+ )
126
+ )
127
+ print("Languages: \n{}".format("\t".join(unique_languages)))
128
+ return languages2songs
129
+
130
+
131
+ def main(output_path, dataset_path):
132
+ print("-" * 10)
133
+ print("Preparing test samples for csd...\n")
134
+
135
+ if not os.path.exists(os.path.join(dataset_path, "utterances")):
136
+ print("Spliting into utterances...\n")
137
+ _main(dataset_path)
138
+
139
+ save_dir = os.path.join(output_path, "csd")
140
+ train_output_file = os.path.join(save_dir, "train.json")
141
+ test_output_file = os.path.join(save_dir, "test.json")
142
+ if has_existed(test_output_file):
143
+ return
144
+
145
+ # Load
146
+ csd_path = os.path.join(dataset_path, "utterances")
147
+
148
+ language2songs = csd_statistics(csd_path)
149
+ test_songs = get_test_songs()
150
+
151
+ # We select songs of standard samples as test songs
152
+ train = []
153
+ test = []
154
+
155
+ train_index_count = 0
156
+ test_index_count = 0
157
+
158
+ train_total_duration = 0
159
+ test_total_duration = 0
160
+
161
+ for language, songs in tqdm(language2songs.items()):
162
+ song_names = list(songs.keys())
163
+
164
+ for chosen_song in song_names:
165
+ for chosen_uid in songs[chosen_song]:
166
+ res = {
167
+ "Dataset": "csd",
168
+ "Singer": "Female1_{}".format(language),
169
+ "Uid": "{}_{}_{}".format(language, chosen_song, chosen_uid),
170
+ }
171
+ res["Path"] = "{}{}/{}.wav".format(language, chosen_song, chosen_uid)
172
+ res["Path"] = os.path.join(csd_path, res["Path"])
173
+ assert os.path.exists(res["Path"])
174
+
175
+ waveform, sample_rate = torchaudio.load(res["Path"])
176
+ duration = waveform.size(-1) / sample_rate
177
+ res["Duration"] = duration
178
+
179
+ if [language, chosen_song] in test_songs:
180
+ res["index"] = test_index_count
181
+ test_total_duration += duration
182
+ test.append(res)
183
+ test_index_count += 1
184
+ else:
185
+ res["index"] = train_index_count
186
+ train_total_duration += duration
187
+ train.append(res)
188
+ train_index_count += 1
189
+
190
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
191
+ print(
192
+ "#Train hours= {}, #Test hours= {}".format(
193
+ train_total_duration / 3600, test_total_duration / 3600
194
+ )
195
+ )
196
+
197
+ # Save
198
+ os.makedirs(save_dir, exist_ok=True)
199
+ with open(train_output_file, "w") as f:
200
+ json.dump(train, f, indent=4, ensure_ascii=False)
201
+ with open(test_output_file, "w") as f:
202
+ json.dump(test, f, indent=4, ensure_ascii=False)
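Note: all the preprocessors in this commit emit the same per-utterance metadata schema ("Dataset", "Singer", "Uid", "Path", "Duration", plus a split-local "index"). A minimal sketch of loading one of the generated splits for a quick sanity check; the processed directory and dataset name below are illustrative, not fixed by the code:

import json
import os

processed_dir = "data/processed"  # illustrative output_path
split_file = os.path.join(processed_dir, "csd", "test.json")

with open(split_file, "r", encoding="utf-8") as f:
    utterances = json.load(f)

# Each entry carries the fields filled in by the preprocessors above.
for utt in utterances[:3]:
    print(utt["index"], utt["Uid"], round(utt["Duration"], 2), utt["Path"])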
preprocessors/custom.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from glob import glob
7
+ import os
8
+ import json
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+ from collections import defaultdict
12
+
13
+ from utils.util import has_existed
14
+
15
+
16
+ def statistics(utterance_dir):
17
+ singers = []
18
+ songs = []
19
+ singers2songs = defaultdict(lambda: defaultdict(list))
20
+
21
+ singer_infos = glob(utterance_dir + "/*")
22
+
23
+ for singer_info in singer_infos:
24
+ singer = singer_info.split("/")[-1]
25
+
26
+ song_infos = glob(singer_info + "/*")
27
+
28
+ for song_info in song_infos:
29
+ song = song_info.split("/")[-1]
30
+
31
+ singers.append(singer)
32
+ songs.append(song)
33
+
34
+ utts = glob(song_info + "/*.wav")
35
+
36
+ for utt in utts:
37
+ uid = utt.split("/")[-1].split(".")[0]
38
+ singers2songs[singer][song].append(uid)
39
+
40
+ unique_singers = list(set(singers))
41
+ unique_songs = list(set(songs))
42
+ unique_singers.sort()
43
+ unique_songs.sort()
44
+
45
+ print(
46
+ "Statistics: {} singers, {} utterances ({} unique songs)".format(
47
+ len(unique_singers), len(songs), len(unique_songs)
48
+ )
49
+ )
50
+ print("Singers: \n{}".format("\t".join(unique_singers)))
51
+ return singers2songs, unique_singers
52
+
53
+
54
+ def main(output_path, dataset_path, dataset_name):
55
+ print("-" * 10)
56
+ print("Preparing samples for {}...\n".format(dataset_name))
57
+
58
+ save_dir = os.path.join(output_path, dataset_name)
59
+ os.makedirs(save_dir, exist_ok=True)
60
+
61
+ train_output_file = os.path.join(save_dir, "train.json")
62
+ test_output_file = os.path.join(save_dir, "test.json")
63
+ singer_dict_file = os.path.join(save_dir, "singers.json")
64
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
65
+ if (
66
+ has_existed(train_output_file)
67
+ and has_existed(test_output_file)
68
+ and has_existed(singer_dict_file)
69
+ and has_existed(utt2singer_file)
70
+ ):
71
+ return
72
+ utt2singer = open(utt2singer_file, "w")
73
+
74
+ # Load
75
+ singers2songs, unique_singers = statistics(dataset_path)
76
+
77
+ # Hold out the first utterance of each (singer, song) pair as the test sample
78
+ train = []
79
+ test = []
80
+ test_songs = set()
81
+
82
+ train_index_count = 0
83
+ test_index_count = 0
84
+
85
+ train_total_duration = 0
86
+ test_total_duration = 0
87
+
88
+ for singer, songs in singers2songs.items():
89
+ song_names = list(songs.keys())
90
+
91
+ print("Singer {}...".format(singer))
92
+ for chosen_song in tqdm(song_names):
93
+ for chosen_uid in songs[chosen_song]:
94
+ res = {
95
+ "Dataset": dataset_name,
96
+ "Singer": singer,
97
+ "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
98
+ }
99
+ res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
100
+ res["Path"] = os.path.join(dataset_path, res["Path"])
101
+ assert os.path.exists(res["Path"])
102
+
103
+ waveform, sample_rate = torchaudio.load(res["Path"])
104
+ duration = waveform.size(-1) / sample_rate
105
+ res["Duration"] = duration
106
+
107
+ # Remove utterances whose duration is shorter than 0.01s
108
+ if duration <= 1e-2:
109
+ continue
110
+
111
+ # Place into train or test
112
+ if "{}_{}".format(singer, chosen_song) not in test_songs:
113
+ test_songs.add("{}_{}".format(singer, chosen_song))
114
+
115
+ res["index"] = test_index_count
116
+ test_total_duration += duration
117
+ test.append(res)
118
+ test_index_count += 1
119
+ else:
120
+ res["index"] = train_index_count
121
+ train_total_duration += duration
122
+ train.append(res)
123
+ train_index_count += 1
124
+
125
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
126
+
127
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
128
+ print(
129
+ "#Train hours= {}, #Test hours= {}".format(
130
+ train_total_duration / 3600, test_total_duration / 3600
131
+ )
132
+ )
133
+
134
+ # Save train.json and test.json
135
+ with open(train_output_file, "w") as f:
136
+ json.dump(train, f, indent=4, ensure_ascii=False)
137
+ with open(test_output_file, "w") as f:
138
+ json.dump(test, f, indent=4, ensure_ascii=False)
139
+
140
+ # Save singers.json
141
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
142
+ with open(singer_dict_file, "w") as f:
143
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
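custom.py expects a <dataset_path>/<singer>/<song>/<uid>.wav layout (see statistics() and the Path format string above) and holds out one utterance per song for the test split. A minimal usage sketch; the paths and dataset name are placeholders, and the import assumes the script is run from the repository root:

from preprocessors import custom

dataset_path = "/data/my_singing_corpus"  # contains <singer>/<song>/<uid>.wav
output_path = "data/processed"

# Writes train.json, test.json, singers.json and utt2singer
# under data/processed/my_corpus/.
custom.main(output_path, dataset_path, "my_corpus")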
preprocessors/kising.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import os
8
+ import json
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+ from glob import glob
12
+ from collections import defaultdict
13
+
14
+ from utils.util import has_existed
15
+ from preprocessors import GOLDEN_TEST_SAMPLES
16
+
17
+
18
+ def get_test_folders():
19
+ golden_samples = GOLDEN_TEST_SAMPLES["kising"]
20
+ # every item is a single-element list, eg: ["422"]
21
+ golden_folders = [s.split("_")[:1] for s in golden_samples]
22
+ # folder, eg: 422
23
+ return golden_folders
24
+
25
+
26
+ def KiSing_statistics(data_dir):
27
+ folders = []
28
+ folders2utts = defaultdict(list)
29
+
30
+ folder_infos = glob(data_dir + "/*")
31
+
32
+ for folder_info in folder_infos:
33
+ folder = folder_info.split("/")[-1]
34
+
35
+ folders.append(folder)
36
+
37
+ utts = glob(folder_info + "/*.wav")
38
+
39
+ for utt in utts:
40
+ uid = utt.split("/")[-1].split(".")[0]
41
+ folders2utts[folder].append(uid)
42
+
43
+ unique_folders = list(set(folders))
44
+ unique_folders.sort()
45
+
46
+ print("KiSing: {} unique songs".format(len(unique_folders)))
47
+ return folders2utts
48
+
49
+
50
+ def main(output_path, dataset_path):
51
+ print("-" * 10)
52
+ print("Preparing test samples for KiSing...\n")
53
+
54
+ save_dir = os.path.join(output_path, "kising")
55
+ train_output_file = os.path.join(save_dir, "train.json")
56
+ test_output_file = os.path.join(save_dir, "test.json")
57
+ if has_existed(test_output_file):
58
+ return
59
+
60
+ # Load
61
+ KiSing_dir = dataset_path
62
+
63
+ folders2utts = KiSing_statistics(KiSing_dir)
64
+ test_folders = get_test_folders()
65
+
66
+ # We select songs of standard samples as test songs
67
+ train = []
68
+ test = []
69
+
70
+ train_index_count = 0
71
+ test_index_count = 0
72
+
73
+ train_total_duration = 0
74
+ test_total_duration = 0
75
+
76
+ folder_names = list(folders2utts.keys())
77
+
78
+ for chosen_folder in folder_names:
79
+ for chosen_uid in folders2utts[chosen_folder]:
80
+ res = {
81
+ "Dataset": "kising",
82
+ "Singer": "female1",
83
+ "Uid": "{}_{}".format(chosen_folder, chosen_uid),
84
+ }
85
+ res["Path"] = "{}/{}.wav".format(chosen_folder, chosen_uid)
86
+ res["Path"] = os.path.join(KiSing_dir, res["Path"])
87
+ assert os.path.exists(res["Path"])
88
+
89
+ waveform, sample_rate = torchaudio.load(res["Path"])
90
+ duration = waveform.size(-1) / sample_rate
91
+ res["Duration"] = duration
92
+
93
+ if [chosen_folder] in test_folders:
94
+ res["index"] = test_index_count
95
+ test_total_duration += duration
96
+ test.append(res)
97
+ test_index_count += 1
98
+ else:
99
+ res["index"] = train_index_count
100
+ train_total_duration += duration
101
+ train.append(res)
102
+ train_index_count += 1
103
+
104
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
105
+ print(
106
+ "#Train hours= {}, #Test hours= {}".format(
107
+ train_total_duration / 3600, test_total_duration / 3600
108
+ )
109
+ )
110
+
111
+ # Save
112
+ os.makedirs(save_dir, exist_ok=True)
113
+ with open(train_output_file, "w") as f:
114
+ json.dump(train, f, indent=4, ensure_ascii=False)
115
+ with open(test_output_file, "w") as f:
116
+ json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/libritts.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import torchaudio
9
+ from tqdm import tqdm
10
+ from glob import glob
11
+ from collections import defaultdict
12
+
13
+ from utils.util import has_existed
14
+
15
+
16
+ def libritts_statistics(data_dir):
17
+ speakers = []
18
+ distribution2speakers2pharases2utts = defaultdict(
19
+ lambda: defaultdict(lambda: defaultdict(list))
20
+ )
21
+
22
+ distribution_infos = glob(data_dir + "/*")
23
+
24
+ for distribution_info in distribution_infos:
25
+ distribution = distribution_info.split("/")[-1]
26
+ print(distribution)
27
+
28
+ speaker_infos = glob(distribution_info + "/*")
29
+
30
+ if len(speaker_infos) == 0:
31
+ continue
32
+
33
+ for speaker_info in speaker_infos:
34
+ speaker = speaker_info.split("/")[-1]
35
+
36
+ speakers.append(speaker)
37
+
38
+ pharase_infos = glob(speaker_info + "/*")
39
+
40
+ for pharase_info in pharase_infos:
41
+ pharase = pharase_info.split("/")[-1]
42
+
43
+ utts = glob(pharase_info + "/*.wav")
44
+
45
+ for utt in utts:
46
+ uid = utt.split("/")[-1].split(".")[0]
47
+ distribution2speakers2pharases2utts[distribution][speaker][
48
+ pharase
49
+ ].append(uid)
50
+
51
+ unique_speakers = list(set(speakers))
52
+ unique_speakers.sort()
53
+
54
+ print("Speakers: \n{}".format("\t".join(unique_speakers)))
55
+ return distribution2speakers2pharases2utts, unique_speakers
56
+
57
+
58
+ def main(output_path, dataset_path):
59
+ print("-" * 10)
60
+ print("Preparing samples for libritts...\n")
61
+
62
+ save_dir = os.path.join(output_path, "libritts")
63
+ os.makedirs(save_dir, exist_ok=True)
64
+ train_output_file = os.path.join(save_dir, "train.json")
65
+ test_output_file = os.path.join(save_dir, "test.json")
66
+ singer_dict_file = os.path.join(save_dir, "singers.json")
67
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
68
+ if has_existed(train_output_file):
69
+ return
70
+ utt2singer = open(utt2singer_file, "w")
71
+
72
+ # Load
73
+ libritts_path = dataset_path
74
+
75
+ distribution2speakers2pharases2utts, unique_speakers = libritts_statistics(
76
+ libritts_path
77
+ )
78
+
79
+ # Utterances from the non-"train" distributions are used as test samples
80
+ train = []
81
+ test = []
82
+
83
+ train_index_count = 0
84
+ test_index_count = 0
85
+
86
+ train_total_duration = 0
87
+ test_total_duration = 0
88
+
89
+ for distribution, speakers2pharases2utts in tqdm(
90
+ distribution2speakers2pharases2utts.items()
91
+ ):
92
+ for speaker, pharases2utts in tqdm(speakers2pharases2utts.items()):
93
+ pharase_names = list(pharases2utts.keys())
94
+
95
+ for chosen_pharase in pharase_names:
96
+ for chosen_uid in pharases2utts[chosen_pharase]:
97
+ res = {
98
+ "Dataset": "libritts",
99
+ "Singer": speaker,
100
+ "Uid": "{}#{}#{}#{}".format(
101
+ distribution, speaker, chosen_pharase, chosen_uid
102
+ ),
103
+ }
104
+ res["Path"] = "{}/{}/{}/{}.wav".format(
105
+ distribution, speaker, chosen_pharase, chosen_uid
106
+ )
107
+ res["Path"] = os.path.join(libritts_path, res["Path"])
108
+ assert os.path.exists(res["Path"])
109
+
110
+ waveform, sample_rate = torchaudio.load(res["Path"])
111
+ duration = waveform.size(-1) / sample_rate
112
+ res["Duration"] = duration
113
+
114
+ if not "train" in distribution:
115
+ res["index"] = test_index_count
116
+ test_total_duration += duration
117
+ test.append(res)
118
+ test_index_count += 1
119
+ else:
120
+ res["index"] = train_index_count
121
+ train_total_duration += duration
122
+ train.append(res)
123
+ train_index_count += 1
124
+
125
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
126
+
127
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
128
+ print(
129
+ "#Train hours= {}, #Test hours= {}".format(
130
+ train_total_duration / 3600, test_total_duration / 3600
131
+ )
132
+ )
133
+
134
+ # Save train.json and test.json
135
+ with open(train_output_file, "w") as f:
136
+ json.dump(train, f, indent=4, ensure_ascii=False)
137
+ with open(test_output_file, "w") as f:
138
+ json.dump(test, f, indent=4, ensure_ascii=False)
139
+
140
+ # Save singers.json
141
+ singer_lut = {name: i for i, name in enumerate(unique_speakers)}
142
+ with open(singer_dict_file, "w") as f:
143
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/lijian.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import glob
7
+ import os
8
+ import json
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+ from collections import defaultdict
12
+
13
+
14
+ from utils.io import save_audio
15
+ from utils.util import has_existed, remove_and_create
16
+ from utils.audio_slicer import Slicer
17
+ from preprocessors import GOLDEN_TEST_SAMPLES
18
+
19
+
20
+ def split_to_utterances(input_dir, output_dir):
21
+ print("Splitting to utterances for {}...".format(input_dir))
22
+
23
+ files_list = glob.glob("*.flac", root_dir=input_dir)
24
+ files_list.sort()
25
+ for wav_file in tqdm(files_list):
26
+ # Load waveform
27
+ waveform, fs = torchaudio.load(os.path.join(input_dir, wav_file))
28
+
29
+ # Song name
30
+ filename = wav_file.replace(" ", "")
31
+ filename = filename.replace("(Live)", "")
32
+ song_id, filename = filename.split("李健-")
33
+
34
+ song_id = song_id.split("_")[0]
35
+ song_name = "{:03d}".format(int(song_id)) + filename.split("_")[0].split("-")[0]
36
+
37
+ # Split
38
+ slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000)
39
+ chunks = slicer.slice(waveform)
40
+
41
+ save_dir = os.path.join(output_dir, song_name)
42
+ remove_and_create(save_dir)
43
+
44
+ for i, chunk in enumerate(chunks):
45
+ output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
46
+ save_audio(output_file, chunk, fs)
47
+
48
+
49
+ def _main(dataset_path):
50
+ """
51
+ Split to utterances
52
+ """
53
+ utterance_dir = os.path.join(dataset_path, "utterances")
54
+ split_to_utterances(os.path.join(dataset_path, "vocal_v2"), utterance_dir)
55
+
56
+
57
+ def get_test_songs():
58
+ golden_samples = GOLDEN_TEST_SAMPLES["lijian"]
59
+ golden_songs = [s.split("_")[0] for s in golden_samples]
60
+ return golden_songs
61
+
62
+
63
+ def statistics(utt_dir):
64
+ song2utts = defaultdict(list)
65
+
66
+ song_infos = glob.glob(utt_dir + "/*")
67
+ song_infos.sort()
68
+ for song in song_infos:
69
+ song_name = song.split("/")[-1]
70
+ utt_infos = glob.glob(song + "/*.wav")
71
+ utt_infos.sort()
72
+ for utt in utt_infos:
73
+ uid = utt.split("/")[-1].split(".")[0]
74
+ song2utts[song_name].append(uid)
75
+
76
+ utt_sum = sum([len(utts) for utts in song2utts.values()])
77
+ print("Li Jian: {} unique songs, {} utterances".format(len(song2utts), utt_sum))
78
+ return song2utts
79
+
80
+
81
+ def main(output_path, dataset_path):
82
+ print("-" * 10)
83
+ print("Preparing test samples for Li Jian...\n")
84
+
85
+ if not os.path.exists(os.path.join(dataset_path, "utterances")):
86
+ print("Spliting into utterances...\n")
87
+ _main(dataset_path)
88
+
89
+ save_dir = os.path.join(output_path, "lijian")
90
+ train_output_file = os.path.join(save_dir, "train.json")
91
+ test_output_file = os.path.join(save_dir, "test.json")
92
+ if has_existed(test_output_file):
93
+ return
94
+
95
+ # Load
96
+ lijian_path = os.path.join(dataset_path, "utterances")
97
+ song2utts = statistics(lijian_path)
98
+ test_songs = get_test_songs()
99
+
100
+ # We select songs of standard samples as test songs
101
+ train = []
102
+ test = []
103
+
104
+ train_index_count = 0
105
+ test_index_count = 0
106
+
107
+ train_total_duration = 0
108
+ test_total_duration = 0
109
+
110
+ for chosen_song, utts in tqdm(song2utts.items()):
111
+ for chosen_uid in song2utts[chosen_song]:
112
+ res = {
113
+ "Dataset": "lijian",
114
+ "Singer": "lijian",
115
+ "Uid": "{}_{}".format(chosen_song, chosen_uid),
116
+ }
117
+ res["Path"] = "{}/{}.wav".format(chosen_song, chosen_uid)
118
+ res["Path"] = os.path.join(lijian_path, res["Path"])
119
+ assert os.path.exists(res["Path"])
120
+
121
+ waveform, sample_rate = torchaudio.load(res["Path"])
122
+ duration = waveform.size(-1) / sample_rate
123
+ res["Duration"] = duration
124
+
125
+ if duration <= 1e-8:
126
+ continue
127
+
128
+ if chosen_song in test_songs:
129
+ res["index"] = test_index_count
130
+ test_total_duration += duration
131
+ test.append(res)
132
+ test_index_count += 1
133
+ else:
134
+ res["index"] = train_index_count
135
+ train_total_duration += duration
136
+ train.append(res)
137
+ train_index_count += 1
138
+
139
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
140
+ print(
141
+ "#Train hours= {}, #Test hours= {}".format(
142
+ train_total_duration / 3600, test_total_duration / 3600
143
+ )
144
+ )
145
+
146
+ # Save
147
+ os.makedirs(save_dir, exist_ok=True)
148
+ with open(train_output_file, "w") as f:
149
+ json.dump(train, f, indent=4, ensure_ascii=False)
150
+ with open(test_output_file, "w") as f:
151
+ json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/ljspeech.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ from tqdm import tqdm
8
+ import os
9
+ import torchaudio
10
+ from utils import audio
11
+ import csv
12
+ import random
13
+
14
+ from utils.util import has_existed
15
+ from text import _clean_text
16
+ import librosa
17
+ import soundfile as sf
18
+ from scipy.io import wavfile
19
+
20
+ from pathlib import Path
21
+ import numpy as np
22
+
23
+
24
+ def textgird_extract(
25
+ corpus_directory,
26
+ output_directory,
27
+ mfa_path=os.path.join("mfa", "montreal-forced-aligner", "bin", "mfa_align"),
28
+ lexicon=os.path.join("mfa", "lexicon", "librispeech-lexicon.txt"),
29
+ acoustic_model_path=os.path.join(
30
+ "mfa", "montreal-forced-aligner", "pretrained_models", "english.zip"
31
+ ),
32
+ jobs="8",
33
+ ):
34
+ assert os.path.exists(
35
+ corpus_directory
36
+ ), "Please check the directionary contains *.wav, *.lab"
37
+ assert (
38
+ os.path.exists(mfa_path)
39
+ and os.path.exists(lexicon)
40
+ and os.path.exists(acoustic_model_path)
41
+ ), f"Please download the MFA tools to {mfa_path} firstly"
42
+ Path(output_directory).mkdir(parents=True, exist_ok=True)
43
+ print(f"MFA results are save in {output_directory}")
44
+ os.system(
45
+ f".{os.path.sep}{mfa_path} {corpus_directory} {lexicon} {acoustic_model_path} {output_directory} -j {jobs} --clean"
46
+ )
47
+
48
+
49
+ def get_lines(file):
50
+ lines = []
51
+ with open(file, encoding="utf-8") as f:
52
+ for line in tqdm(f):
53
+ lines.append(line.strip())
54
+ return lines
55
+
56
+
57
+ def get_uid2utt(ljspeech_path, dataset, cfg):
58
+ index_count = 0
59
+ total_duration = 0
60
+
61
+ uid2utt = []
62
+ for l in tqdm(dataset):
63
+ items = l.split("|")
64
+ uid = items[0]
65
+ text = items[2]
66
+
67
+ res = {
68
+ "Dataset": "LJSpeech",
69
+ "index": index_count,
70
+ "Singer": "LJSpeech",
71
+ "Uid": uid,
72
+ "Text": text,
73
+ }
74
+
75
+ # Duration in wav files
76
+ audio_file = os.path.join(ljspeech_path, "wavs/{}.wav".format(uid))
77
+
78
+ res["Path"] = audio_file
79
+
80
+ waveform, sample_rate = torchaudio.load(audio_file)
81
+ duration = waveform.size(-1) / sample_rate
82
+ res["Duration"] = duration
83
+
84
+ uid2utt.append(res)
85
+
86
+ index_count = index_count + 1
87
+ total_duration += duration
88
+
89
+ return uid2utt, total_duration / 3600
90
+
91
+
92
+ def split_dataset(lines, test_rate=0.05, test_size=None):
93
+ if test_size is None:
94
+ test_size = int(len(lines) * test_rate)
95
+ random.shuffle(lines)
96
+
97
+ train_set = []
98
+ test_set = []
99
+
100
+ for line in lines[:test_size]:
101
+ test_set.append(line)
102
+ for line in lines[test_size:]:
103
+ train_set.append(line)
104
+ return train_set, test_set
105
+
106
+
107
+ max_wav_value = 32768.0
108
+
109
+
110
+ def prepare_align(dataset, dataset_path, cfg, output_path):
111
+ in_dir = dataset_path
112
+ out_dir = os.path.join(output_path, dataset, cfg.raw_data)
113
+ sampling_rate = cfg.sample_rate
114
+ cleaners = cfg.text_cleaners
115
+ speaker = "LJSpeech"
116
+ with open(os.path.join(dataset_path, "metadata.csv"), encoding="utf-8") as f:
117
+ for line in tqdm(f):
118
+ parts = line.strip().split("|")
119
+ base_name = parts[0]
120
+ text = parts[2]
121
+ text = _clean_text(text, cleaners)
122
+
123
+ output_wav_path = os.path.join(out_dir, speaker, "{}.wav".format(base_name))
124
+ output_lab_path = os.path.join(out_dir, speaker, "{}.lab".format(base_name))
125
+
126
+ if os.path.exists(output_wav_path) and os.path.exists(output_lab_path):
127
+ continue
128
+
129
+ wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
130
+ if os.path.exists(wav_path):
131
+ os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
132
+ wav, _ = librosa.load(wav_path, sr=sampling_rate)
133
+ wav = wav / max(abs(wav)) * max_wav_value
134
+
135
+ wavfile.write(
136
+ os.path.join(out_dir, speaker, "{}.wav".format(base_name)),
137
+ sampling_rate,
138
+ wav.astype(np.int16),
139
+ )
140
+
141
+ with open(
142
+ os.path.join(out_dir, speaker, "{}.lab".format(base_name)),
143
+ "w",
144
+ ) as f1:
145
+ f1.write(text)
146
+ # Extract textgird with MFA
147
+ textgird_extract(
148
+ corpus_directory=out_dir,
149
+ output_directory=os.path.join(output_path, dataset, "TextGrid"),
150
+ )
151
+
152
+
153
+ def main(output_path, dataset_path, cfg):
154
+ print("-" * 10)
155
+ print("Dataset splits for {}...\n".format("LJSpeech"))
156
+
157
+ dataset = "LJSpeech"
158
+
159
+ save_dir = os.path.join(output_path, dataset)
160
+ os.makedirs(save_dir, exist_ok=True)
161
+ ljspeech_path = dataset_path
162
+
163
+ train_output_file = os.path.join(save_dir, "train.json")
164
+ test_output_file = os.path.join(save_dir, "test.json")
165
+ singer_dict_file = os.path.join(save_dir, "singers.json")
166
+
167
+ speaker = "LJSpeech"
168
+ speakers = [dataset + "_" + speaker]
169
+ singer_lut = {name: i for i, name in enumerate(sorted(speakers))}
170
+ with open(singer_dict_file, "w") as f:
171
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
172
+
173
+ if has_existed(train_output_file) and has_existed(test_output_file):
174
+ return
175
+
176
+ meta_file = os.path.join(ljspeech_path, "metadata.csv")
177
+ lines = get_lines(meta_file)
178
+
179
+ train_set, test_set = split_dataset(lines)
180
+
181
+ res, hours = get_uid2utt(ljspeech_path, train_set, cfg)
182
+
183
+ # Save train
184
+ os.makedirs(save_dir, exist_ok=True)
185
+ with open(train_output_file, "w") as f:
186
+ json.dump(res, f, indent=4, ensure_ascii=False)
187
+
188
+ print("Train_hours= {}".format(hours))
189
+
190
+ res, hours = get_uid2utt(ljspeech_path, test_set, cfg)
191
+
192
+ # Save test
193
+ os.makedirs(save_dir, exist_ok=True)
194
+ with open(test_output_file, "w") as f:
195
+ json.dump(res, f, indent=4, ensure_ascii=False)
196
+
197
+ print("Test_hours= {}".format(hours))
preprocessors/ljspeech_vocoder.py ADDED
@@ -0,0 +1,86 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import torchaudio
9
+ from tqdm import tqdm
10
+ from glob import glob
11
+
12
+ from utils.util import has_existed
13
+
14
+
15
+ def main(output_path, dataset_path):
16
+ print("-" * 10)
17
+ print("Dataset splits for ljspeech...\n")
18
+
19
+ save_dir = os.path.join(output_path, "ljspeech")
20
+ ljspeech_path = dataset_path
21
+
22
+ wave_files = glob(ljspeech_path + "/wavs/*.wav")
23
+
24
+ train_output_file = os.path.join(save_dir, "train.json")
25
+ test_output_file = os.path.join(save_dir, "test.json")
26
+
27
+ if has_existed(train_output_file):
28
+ return
29
+
30
+ utts = []
31
+
32
+ for wave_file in tqdm(wave_files):
33
+ res = {
34
+ "Dataset": "ljspeech",
35
+ "Singer": "female1",
36
+ "Uid": "{}".format(wave_file.split("/")[-1].split(".")[0]),
37
+ }
38
+ res["Path"] = wave_file
39
+ assert os.path.exists(res["Path"])
40
+
41
+ waveform, sample_rate = torchaudio.load(res["Path"])
42
+ duration = waveform.size(-1) / sample_rate
43
+ res["Duration"] = duration
44
+
45
+ if duration <= 1e-8:
46
+ continue
47
+
48
+ utts.append(res)
49
+
50
+ test_length = len(utts) // 20
51
+
52
+ train_utts = []
53
+ train_index_count = 0
54
+ train_total_duration = 0
55
+
56
+ for i in tqdm(range(len(utts) - test_length)):
57
+ tmp = utts[i]
58
+ tmp["index"] = train_index_count
59
+ train_index_count += 1
60
+ train_total_duration += tmp["Duration"]
61
+ train_utts.append(tmp)
62
+
63
+ test_utts = []
64
+ test_index_count = 0
65
+ test_total_duration = 0
66
+
67
+ for i in tqdm(range(len(utts) - test_length, len(utts))):
68
+ tmp = utts[i]
69
+ tmp["index"] = test_index_count
70
+ test_index_count += 1
71
+ test_total_duration += tmp["Duration"]
72
+ test_utts.append(tmp)
73
+
74
+ print("#Train = {}, #Test = {}".format(len(train_utts), len(test_utts)))
75
+ print(
76
+ "#Train hours= {}, #Test hours= {}".format(
77
+ train_total_duration / 3600, test_total_duration / 3600
78
+ )
79
+ )
80
+
81
+ # Save
82
+ os.makedirs(save_dir, exist_ok=True)
83
+ with open(train_output_file, "w") as f:
84
+ json.dump(train_utts, f, indent=4, ensure_ascii=False)
85
+ with open(test_output_file, "w") as f:
86
+ json.dump(test_utts, f, indent=4, ensure_ascii=False)
preprocessors/m4singer.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import librosa
9
+ from tqdm import tqdm
10
+ from collections import defaultdict
11
+
12
+ from utils.util import has_existed
13
+ from preprocessors import GOLDEN_TEST_SAMPLES
14
+
15
+
16
+ def get_test_songs():
17
+ golden_samples = GOLDEN_TEST_SAMPLES["m4singer"]
18
+ # every item is a tuple (singer, song)
19
+ golden_songs = [s.split("_")[:2] for s in golden_samples]
20
+ # singer_song, eg: Alto-1_美错
21
+ golden_songs = ["_".join(t) for t in golden_songs]
22
+ return golden_songs
23
+
24
+
25
+ def m4singer_statistics(meta):
26
+ singers = []
27
+ songs = []
28
+ singer2songs = defaultdict(lambda: defaultdict(list))
29
+ for utt in meta:
30
+ p, s, uid = utt["item_name"].split("#")
31
+ singers.append(p)
32
+ songs.append(s)
33
+ singer2songs[p][s].append(uid)
34
+
35
+ unique_singers = list(set(singers))
36
+ unique_songs = list(set(songs))
37
+ unique_singers.sort()
38
+ unique_songs.sort()
39
+
40
+ print(
41
+ "M4Singer: {} singers, {} utterances ({} unique songs)".format(
42
+ len(unique_singers), len(songs), len(unique_songs)
43
+ )
44
+ )
45
+ print("Singers: \n{}".format("\t".join(unique_singers)))
46
+ return singer2songs, unique_singers
47
+
48
+
49
+ def main(output_path, dataset_path):
50
+ print("-" * 10)
51
+ print("Preparing test samples for m4singer...\n")
52
+
53
+ save_dir = os.path.join(output_path, "m4singer")
54
+ os.makedirs(save_dir, exist_ok=True)
55
+ train_output_file = os.path.join(save_dir, "train.json")
56
+ test_output_file = os.path.join(save_dir, "test.json")
57
+ singer_dict_file = os.path.join(save_dir, "singers.json")
58
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
59
+ if (
60
+ has_existed(train_output_file)
61
+ and has_existed(test_output_file)
62
+ and has_existed(singer_dict_file)
63
+ and has_existed(utt2singer_file)
64
+ ):
65
+ return
66
+ utt2singer = open(utt2singer_file, "w")
67
+
68
+ # Load
69
+ m4singer_dir = dataset_path
70
+ meta_file = os.path.join(m4singer_dir, "meta.json")
71
+ with open(meta_file, "r", encoding="utf-8") as f:
72
+ meta = json.load(f)
73
+
74
+ singer2songs, unique_singers = m4singer_statistics(meta)
75
+
76
+ test_songs = get_test_songs()
77
+
78
+ # We select songs of standard samples as test songs
79
+ train = []
80
+ test = []
81
+
82
+ train_index_count = 0
83
+ test_index_count = 0
84
+
85
+ train_total_duration = 0
86
+ test_total_duration = 0
87
+
88
+ for singer, songs in tqdm(singer2songs.items()):
89
+ song_names = list(songs.keys())
90
+
91
+ for chosen_song in song_names:
92
+ chosen_song = chosen_song.replace(" ", "-")
93
+ for chosen_uid in songs[chosen_song]:
94
+ res = {
95
+ "Dataset": "m4singer",
96
+ "Singer": singer,
97
+ "Song": chosen_song,
98
+ "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
99
+ }
100
+
101
+ res["Path"] = os.path.join(
102
+ m4singer_dir, "{}#{}/{}.wav".format(singer, chosen_song, chosen_uid)
103
+ )
104
+ assert os.path.exists(res["Path"])
105
+
106
+ duration = librosa.get_duration(filename=res["Path"])
107
+ res["Duration"] = duration
108
+
109
+ if "_".join([singer, chosen_song]) in test_songs:
110
+ res["index"] = test_index_count
111
+ test_total_duration += duration
112
+ test.append(res)
113
+ test_index_count += 1
114
+ else:
115
+ res["index"] = train_index_count
116
+ train_total_duration += duration
117
+ train.append(res)
118
+ train_index_count += 1
119
+
120
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
121
+
122
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
123
+ print(
124
+ "#Train hours= {}, #Test hours= {}".format(
125
+ train_total_duration / 3600, test_total_duration / 3600
126
+ )
127
+ )
128
+
129
+ # Save train.json and test.json
130
+ with open(train_output_file, "w") as f:
131
+ json.dump(train, f, indent=4, ensure_ascii=False)
132
+ with open(test_output_file, "w") as f:
133
+ json.dump(test, f, indent=4, ensure_ascii=False)
134
+
135
+ # Save singers.json
136
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
137
+ with open(singer_dict_file, "w") as f:
138
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/metadata.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ from tqdm import tqdm
9
+
10
+
11
+ def cal_metadata(cfg):
12
+ """
13
+ Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.
14
+ """
15
+ from collections import Counter
16
+
17
+ datasets = cfg.dataset
18
+
19
+ print("-" * 10)
20
+ print("Preparing metadata...")
21
+ print("Including: \n{}\n".format("\n".join(datasets)))
22
+
23
+ datasets.sort()
24
+
25
+ for dataset in tqdm(datasets):
26
+ save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
27
+ assert os.path.exists(save_dir)
28
+
29
+ # 'train.json' and 'test.json' of target dataset
30
+ train_metadata = os.path.join(save_dir, "train.json")
31
+ test_metadata = os.path.join(save_dir, "test.json")
32
+
33
+ # Sort the metadata as the duration order
34
+ with open(train_metadata, "r", encoding="utf-8") as f:
35
+ train_utterances = json.load(f)
36
+ with open(test_metadata, "r", encoding="utf-8") as f:
37
+ test_utterances = json.load(f)
38
+
39
+ train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
40
+ test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])
41
+
42
+ # Write back the sorted metadata
43
+ with open(train_metadata, "w") as f:
44
+ json.dump(train_utterances, f, indent=4, ensure_ascii=False)
45
+ with open(test_metadata, "w") as f:
46
+ json.dump(test_utterances, f, indent=4, ensure_ascii=False)
47
+
48
+ # Paths of metadata needed to be generated
49
+ singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
50
+ utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)
51
+
52
+ # Get the total duration and singer names for train and test utterances
53
+ train_total_duration = sum(utt["Duration"] for utt in train_utterances)
54
+ test_total_duration = sum(utt["Duration"] for utt in test_utterances)
55
+
56
+ singer_names = set(
57
+ f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
58
+ for utt in train_utterances + test_utterances
59
+ )
60
+
61
+ # Write the utt2singer file and sort the singer names
62
+ with open(utt2singer_file, "w", encoding="utf-8") as f:
63
+ for utt in train_utterances + test_utterances:
64
+ f.write(
65
+ f"{utt['Dataset']}_{utt['Uid']}\t{replace_augment_name(utt['Dataset'])}_{utt['Singer']}\n"
66
+ )
67
+
68
+ singer_names = sorted(singer_names)
69
+ singer_lut = {name: i for i, name in enumerate(singer_names)}
70
+
71
+ # dump singers.json
72
+ with open(singer_dict_file, "w", encoding="utf-8") as f:
73
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
74
+
75
+ meta_info = {
76
+ "dataset": dataset,
77
+ "statistics": {
78
+ "size": len(train_utterances) + len(test_utterances),
79
+ "hours": round(train_total_duration / 3600, 4)
80
+ + round(test_total_duration / 3600, 4),
81
+ },
82
+ "train": {
83
+ "size": len(train_utterances),
84
+ "hours": round(train_total_duration / 3600, 4),
85
+ },
86
+ "test": {
87
+ "size": len(test_utterances),
88
+ "hours": round(test_total_duration / 3600, 4),
89
+ },
90
+ "singers": {"size": len(singer_lut)},
91
+ }
92
+ # Use Counter to count the minutes for each singer
93
+ total_singer2mins = Counter()
94
+ training_singer2mins = Counter()
95
+ for utt in train_utterances:
96
+ k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
97
+ training_singer2mins[k] += utt["Duration"] / 60
98
+ total_singer2mins[k] += utt["Duration"] / 60
99
+ for utt in test_utterances:
100
+ k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
101
+ total_singer2mins[k] += utt["Duration"] / 60
102
+
103
+ training_singer2mins = dict(
104
+ sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True)
105
+ )
106
+ training_singer2mins = {k: round(v, 2) for k, v in training_singer2mins.items()}
107
+ meta_info["singers"]["training_minutes"] = training_singer2mins
108
+
109
+ total_singer2mins = dict(
110
+ sorted(total_singer2mins.items(), key=lambda x: x[1], reverse=True)
111
+ )
112
+ total_singer2mins = {k: round(v, 2) for k, v in total_singer2mins.items()}
113
+ meta_info["singers"]["minutes"] = total_singer2mins
114
+
115
+ with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
116
+ json.dump(meta_info, f, indent=4, ensure_ascii=False)
117
+
118
+ for singer, mins in training_singer2mins.items():
119
+ print(f"Singer {singer}: {mins} mins for training")
120
+ print("-" * 10, "\n")
121
+
122
+
123
+ def replace_augment_name(dataset: str) -> str:
124
+ """Replace the augmented dataset name with the original dataset name.
125
+ >>> print(replace_augment_name("dataset_equalizer"))
126
+ dataset
127
+ """
128
+ if "equalizer" in dataset:
129
+ dataset = dataset.replace("_equalizer", "")
130
+ elif "formant_shift" in dataset:
131
+ dataset = dataset.replace("_formant_shift", "")
132
+ elif "pitch_shift" in dataset:
133
+ dataset = dataset.replace("_pitch_shift", "")
134
+ elif "time_stretch" in dataset:
135
+ dataset = dataset.replace("_time_stretch", "")
136
+ else:
137
+ pass
138
+ return dataset
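cal_metadata() only reads cfg.dataset, cfg.preprocess.processed_dir, cfg.preprocess.spk2id and cfg.preprocess.utt2spk. A minimal sketch of driving it with a plain namespace object; the attribute values are illustrative, and in the real pipeline they presumably come from the project's JSON config:

from types import SimpleNamespace

from preprocessors.metadata import cal_metadata

cfg = SimpleNamespace(
    dataset=["csd", "m4singer"],  # datasets that already have train.json / test.json
    preprocess=SimpleNamespace(
        processed_dir="data/processed",  # root holding <dataset>/train.json etc.
        spk2id="singers.json",  # filename of the singer lookup table
        utt2spk="utt2singer",  # filename of the utt -> singer mapping
    ),
)

# Sorts each split by duration, rewrites the JSON files, and dumps
# singers.json, utt2singer and meta_info.json for every dataset.
cal_metadata(cfg)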
preprocessors/nus48e.py ADDED
@@ -0,0 +1,203 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import torchaudio
9
+ from tqdm import tqdm
10
+ from glob import glob
11
+ from collections import defaultdict
12
+
13
+
14
+ from utils.io import save_audio
15
+ from utils.util import has_existed
16
+ from utils.audio_slicer import Slicer
17
+ from preprocessors import GOLDEN_TEST_SAMPLES
18
+
19
+
20
+ def split_to_utterances(dataset_path, singer, style, output_dir):
21
+ data_dir = os.path.join(dataset_path, singer, style)
22
+
23
+ print("Splitting to utterances for {}...".format(data_dir))
24
+
25
+ wave_files = glob(data_dir + "/*.wav")
26
+
27
+ for wav_file in tqdm(wave_files):
28
+ # Load waveform
29
+ song_name = wav_file.split("/")[-1].split(".")[0]
30
+ waveform, fs = torchaudio.load(wav_file)
31
+
32
+ # Split
33
+ slicer = Slicer(sr=fs, threshold=-40.0, max_sil_kept=4000)
34
+ chunks = slicer.slice(waveform)
35
+
36
+ for i, chunk in enumerate(chunks):
37
+ save_dir = os.path.join(output_dir, singer, style, song_name)
38
+ os.makedirs(save_dir, exist_ok=True)
39
+
40
+ output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
41
+ save_audio(output_file, chunk, fs)
42
+
43
+
44
+ def _main(dataset_path):
45
+ """
46
+ Split to utterances
47
+ """
48
+ utterance_dir = os.path.join(dataset_path, "utterances")
49
+
50
+ singer_infos = glob(dataset_path + "/*")
51
+
52
+ for singer_info in singer_infos:
53
+ singer = singer_info.split("/")[-1]
54
+
55
+ for style in ["read", "sing"]:
56
+ split_to_utterances(dataset_path, singer, style, utterance_dir)
57
+
58
+
59
+ def get_test_songs():
60
+ golden_samples = GOLDEN_TEST_SAMPLES["nus48e"]
61
+ # every item is a tuple (singer, song)
62
+ golden_songs = [s.split("#")[:2] for s in golden_samples]
63
+ # singer_song, eg: Female1#Almost_lover_Amateur
64
+ return golden_songs
65
+
66
+
67
+ def nus48e_statistics(data_dir):
68
+ singers = []
69
+ songs = []
70
+ singer2songs = defaultdict(lambda: defaultdict(list))
71
+
72
+ singer_infos = glob(data_dir + "/*")
73
+
74
+ for singer_info in singer_infos:
75
+ singer_info_split = singer_info.split("/")[-1]
76
+
77
+ style_infos = glob(singer_info + "/*")
78
+
79
+ for style_info in style_infos:
80
+ style_info_split = style_info.split("/")[-1]
81
+
82
+ singer = singer_info_split + "_" + style_info_split
83
+ singers.append(singer)
84
+
85
+ song_infos = glob(style_info + "/*")
86
+
87
+ for song_info in song_infos:
88
+ song = song_info.split("/")[-1]
89
+
90
+ songs.append(song)
91
+
92
+ utts = glob(song_info + "/*.wav")
93
+
94
+ for utt in utts:
95
+ uid = utt.split("/")[-1].split(".")[0]
96
+ singer2songs[singer][song].append(uid)
97
+
98
+ unique_singers = list(set(singers))
99
+ unique_songs = list(set(songs))
100
+ unique_singers.sort()
101
+ unique_songs.sort()
102
+
103
+ print(
104
+ "nus_48_e: {} singers, {} utterances ({} unique songs)".format(
105
+ len(unique_singers), len(songs), len(unique_songs)
106
+ )
107
+ )
108
+ print("Singers: \n{}".format("\t".join(unique_singers)))
109
+ return singer2songs, unique_singers
110
+
111
+
112
+ def main(output_path, dataset_path):
113
+ print("-" * 10)
114
+ print("Preparing test samples for nus48e...\n")
115
+
116
+ if not os.path.exists(os.path.join(dataset_path, "utterances")):
117
+ print("Spliting into utterances...\n")
118
+ _main(dataset_path)
119
+
120
+ save_dir = os.path.join(output_path, "nus48e")
121
+ os.makedirs(save_dir, exist_ok=True)
122
+ train_output_file = os.path.join(save_dir, "train.json")
123
+ test_output_file = os.path.join(save_dir, "test.json")
124
+ singer_dict_file = os.path.join(save_dir, "singers.json")
125
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
126
+ if (
127
+ has_existed(train_output_file)
128
+ and has_existed(test_output_file)
129
+ and has_existed(singer_dict_file)
130
+ and has_existed(utt2singer_file)
131
+ ):
132
+ return
133
+ utt2singer = open(utt2singer_file, "w")
134
+
135
+ # Load
136
+ nus48e_path = os.path.join(dataset_path, "utterances")
137
+
138
+ singer2songs, unique_singers = nus48e_statistics(nus48e_path)
139
+ test_songs = get_test_songs()
140
+
141
+ # We select songs of standard samples as test songs
142
+ train = []
143
+ test = []
144
+
145
+ train_index_count = 0
146
+ test_index_count = 0
147
+
148
+ train_total_duration = 0
149
+ test_total_duration = 0
150
+
151
+ for singer, songs in singer2songs.items():
152
+ song_names = list(songs.keys())
153
+
154
+ for chosen_song in song_names:
155
+ for chosen_uid in songs[chosen_song]:
156
+ res = {
157
+ "Dataset": "nus48e",
158
+ "Singer": singer,
159
+ "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
160
+ }
161
+ res["Path"] = "{}/{}/{}/{}.wav".format(
162
+ singer.split("_")[0], singer.split("_")[-1], chosen_song, chosen_uid
163
+ )
164
+ res["Path"] = os.path.join(nus48e_path, res["Path"])
165
+ assert os.path.exists(res["Path"])
166
+
167
+ waveform, sample_rate = torchaudio.load(res["Path"])
168
+ duration = waveform.size(-1) / sample_rate
169
+ res["Duration"] = duration
170
+
171
+ if duration <= 1e-8:
172
+ continue
173
+
174
+ if [singer, chosen_song] in test_songs:
175
+ res["index"] = test_index_count
176
+ test_total_duration += duration
177
+ test.append(res)
178
+ test_index_count += 1
179
+ else:
180
+ res["index"] = train_index_count
181
+ train_total_duration += duration
182
+ train.append(res)
183
+ train_index_count += 1
184
+
185
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
186
+
187
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
188
+ print(
189
+ "#Train hours= {}, #Test hours= {}".format(
190
+ train_total_duration / 3600, test_total_duration / 3600
191
+ )
192
+ )
193
+
194
+ # Save train.json and test.json
195
+ with open(train_output_file, "w") as f:
196
+ json.dump(train, f, indent=4, ensure_ascii=False)
197
+ with open(test_output_file, "w") as f:
198
+ json.dump(test, f, indent=4, ensure_ascii=False)
199
+
200
+ # Save singers.json
201
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
202
+ with open(singer_dict_file, "w") as f:
203
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/opencpop.py ADDED
@@ -0,0 +1,73 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ from tqdm import tqdm
8
+ import os
9
+ import librosa
10
+
11
+ from utils.util import has_existed
12
+
13
+
14
+ def get_lines(file):
15
+ with open(file, "r") as f:
16
+ lines = f.readlines()
17
+ lines = [l.strip() for l in lines]
18
+ return lines
19
+
20
+
21
+ def get_uid2utt(opencpop_path, dataset, dataset_type):
22
+ index_count = 0
23
+ total_duration = 0
24
+
25
+ file = os.path.join(opencpop_path, "segments", "{}.txt".format(dataset_type))
26
+ lines = get_lines(file)
27
+
28
+ uid2utt = []
29
+ for l in tqdm(lines):
30
+ items = l.split("|")
31
+ uid = items[0]
32
+
33
+ res = {
34
+ "Dataset": dataset,
35
+ "index": index_count,
36
+ "Singer": "female1",
37
+ "Uid": uid,
38
+ }
39
+
40
+ # Duration in wav files
41
+ audio_file = os.path.join(opencpop_path, "segments/wavs/{}.wav".format(uid))
42
+ res["Path"] = audio_file
43
+
44
+ duration = librosa.get_duration(filename=res["Path"])
45
+ res["Duration"] = duration
46
+
47
+ uid2utt.append(res)
48
+
49
+ index_count = index_count + 1
50
+ total_duration += duration
51
+
52
+ return uid2utt, total_duration / 3600
53
+
54
+
55
+ def main(dataset, output_path, dataset_path):
56
+ print("-" * 10)
57
+ print("Dataset splits for {}...\n".format(dataset))
58
+
59
+ save_dir = os.path.join(output_path, dataset)
60
+ opencpop_path = dataset_path
61
+ for dataset_type in ["train", "test"]:
62
+ output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
63
+ if has_existed(output_file):
64
+ continue
65
+
66
+ res, hours = get_uid2utt(opencpop_path, dataset, dataset_type)
67
+
68
+ # Save
69
+ os.makedirs(save_dir, exist_ok=True)
70
+ with open(output_file, "w") as f:
71
+ json.dump(res, f, indent=4, ensure_ascii=False)
72
+
73
+ print("{}_{}_hours= {}".format(dataset, dataset_type, hours))
preprocessors/opensinger.py ADDED
@@ -0,0 +1,169 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import os
8
+ import json
9
+ import librosa
10
+ from tqdm import tqdm
11
+ from glob import glob
12
+ from collections import defaultdict
13
+
14
+ from utils.util import has_existed
15
+ from preprocessors import GOLDEN_TEST_SAMPLES
16
+
17
+
18
+ def get_test_songs():
19
+ golden_samples = GOLDEN_TEST_SAMPLES["opensinger"]
20
+ # every item is a list: [gender, singer_id, song]
21
+ golden_songs = [s.split("_")[:3] for s in golden_samples]
22
+ # gender_singerid_song, eg: Man_10_<song_name>
23
+ return golden_songs
24
+
25
+
26
+ def opensinger_statistics(data_dir):
27
+ singers = []
28
+ songs = []
29
+ singer2songs = defaultdict(lambda: defaultdict(list))
30
+
31
+ gender_infos = glob(data_dir + "/*")
32
+
33
+ for gender_info in gender_infos:
34
+ gender_info_split = gender_info.split("/")[-1][:-3]
35
+
36
+ singer_and_song_infos = glob(gender_info + "/*")
37
+
38
+ for singer_and_song_info in singer_and_song_infos:
39
+ singer_and_song_info_split = singer_and_song_info.split("/")[-1].split("_")
40
+ singer_id, song = (
41
+ singer_and_song_info_split[0],
42
+ singer_and_song_info_split[1],
43
+ )
44
+ singer = gender_info_split + "_" + singer_id
45
+ singers.append(singer)
46
+ songs.append(song)
47
+
48
+ utts = glob(singer_and_song_info + "/*.wav")
49
+
50
+ for utt in utts:
51
+ uid = utt.split("/")[-1].split("_")[-1].split(".")[0]
52
+ singer2songs[singer][song].append(uid)
53
+
54
+ unique_singers = list(set(singers))
55
+ unique_songs = list(set(songs))
56
+ unique_singers.sort()
57
+ unique_songs.sort()
58
+
59
+ print(
60
+ "opensinger: {} singers, {} songs ({} unique songs)".format(
61
+ len(unique_singers), len(songs), len(unique_songs)
62
+ )
63
+ )
64
+ print("Singers: \n{}".format("\t".join(unique_singers)))
65
+ return singer2songs, unique_singers
66
+
67
+
68
+ def main(output_path, dataset_path):
69
+ print("-" * 10)
70
+ print("Preparing test samples for opensinger...\n")
71
+
72
+ save_dir = os.path.join(output_path, "opensinger")
73
+ os.makedirs(save_dir, exist_ok=True)
74
+ train_output_file = os.path.join(save_dir, "train.json")
75
+ test_output_file = os.path.join(save_dir, "test.json")
76
+ singer_dict_file = os.path.join(save_dir, "singers.json")
77
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
78
+ if (
79
+ has_existed(train_output_file)
80
+ and has_existed(test_output_file)
81
+ and has_existed(singer_dict_file)
82
+ and has_existed(utt2singer_file)
83
+ ):
84
+ return
85
+ utt2singer = open(utt2singer_file, "w")
86
+
87
+ # Load
88
+ opensinger_path = dataset_path
89
+
90
+ singer2songs, unique_singers = opensinger_statistics(opensinger_path)
91
+ test_songs = get_test_songs()
92
+
93
+ # We select songs of standard samples as test songs
94
+ train = []
95
+ test = []
96
+
97
+ train_index_count = 0
98
+ test_index_count = 0
99
+
100
+ train_total_duration = 0
101
+ test_total_duration = 0
102
+
103
+ for i, (singer, songs) in enumerate(singer2songs.items()):
104
+ song_names = list(songs.keys())
105
+
106
+ for chosen_song in tqdm(
107
+ song_names, desc="Singer {}/{}".format(i, len(singer2songs))
108
+ ):
109
+ for chosen_uid in songs[chosen_song]:
110
+ res = {
111
+ "Dataset": "opensinger",
112
+ "Singer": singer,
113
+ "Song": chosen_song,
114
+ "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
115
+ }
116
+ res["Path"] = "{}Raw/{}_{}/{}_{}_{}.wav".format(
117
+ singer.split("_")[0],
118
+ singer.split("_")[1],
119
+ chosen_song,
120
+ singer.split("_")[1],
121
+ chosen_song,
122
+ chosen_uid,
123
+ )
124
+ res["Path"] = os.path.join(opensinger_path, res["Path"])
125
+ assert os.path.exists(res["Path"])
126
+
127
+ duration = librosa.get_duration(filename=res["Path"])
128
+ res["Duration"] = duration
129
+
130
+ if duration > 30:
131
+ print(
132
+ "Wav file: {}, the duration = {:.2f}s > 30s, which has been abandoned.".format(
133
+ res["Path"], duration
134
+ )
135
+ )
136
+ continue
137
+
138
+ if (
139
+ [singer.split("_")[0], singer.split("_")[1], chosen_song]
140
+ ) in test_songs:
141
+ res["index"] = test_index_count
142
+ test_total_duration += duration
143
+ test.append(res)
144
+ test_index_count += 1
145
+ else:
146
+ res["index"] = train_index_count
147
+ train_total_duration += duration
148
+ train.append(res)
149
+ train_index_count += 1
150
+
151
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
152
+
153
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
154
+ print(
155
+ "#Train hours= {}, #Test hours= {}".format(
156
+ train_total_duration / 3600, test_total_duration / 3600
157
+ )
158
+ )
159
+
160
+ # Save train.json and test.json
161
+ with open(train_output_file, "w") as f:
162
+ json.dump(train, f, indent=4, ensure_ascii=False)
163
+ with open(test_output_file, "w") as f:
164
+ json.dump(test, f, indent=4, ensure_ascii=False)
165
+
166
+ # Save singers.json
167
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
168
+ with open(singer_dict_file, "w") as f:
169
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/opera.py ADDED
@@ -0,0 +1,186 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
9
+ from tqdm import tqdm
10
+ import torchaudio
11
+ from glob import glob
12
+ from collections import defaultdict
13
+
14
+ from utils.util import has_existed
15
+ from utils.io import save_audio
16
+ from utils.audio_slicer import Slicer
17
+ from preprocessors import GOLDEN_TEST_SAMPLES
18
+
19
+
20
+ def split_to_utterances(language_dir, output_dir):
21
+ print("Splitting to utterances for {}...".format(language_dir))
22
+
23
+ for wav_file in tqdm(glob("{}/*/*".format(language_dir))):
24
+ # Load waveform
25
+ singer_name, song_name = wav_file.split("/")[-2:]
26
+ song_name = song_name.split(".")[0]
27
+ waveform, fs = torchaudio.load(wav_file)
28
+
29
+ # Split
30
+ slicer = Slicer(sr=fs, threshold=-30.0, max_sil_kept=3000)
31
+ chunks = slicer.slice(waveform)
32
+
33
+ for i, chunk in enumerate(chunks):
34
+ save_dir = os.path.join(output_dir, singer_name, song_name)
35
+ os.makedirs(save_dir, exist_ok=True)
36
+
37
+ output_file = os.path.join(save_dir, "{:04d}.wav".format(i))
38
+ save_audio(output_file, chunk, fs)
39
+
40
+
41
+ def _main(dataset_path):
42
+ """
43
+ Split to utterances
44
+ """
45
+ utterance_dir = os.path.join(dataset_path, "utterances")
46
+
47
+ for lang in ["chinese", "western"]:
48
+ split_to_utterances(os.path.join(dataset_path, lang), utterance_dir)
49
+
50
+
51
+ def get_test_songs():
52
+ golden_samples = GOLDEN_TEST_SAMPLES["opera"]
53
+ # every item is a tuple (singer, song)
54
+ golden_songs = [s.split("#")[:2] for s in golden_samples]
55
+ # singer#song, eg:fem_01#neg_01
56
+ return golden_songs
57
+
58
+
59
+ def opera_statistics(data_dir):
60
+ singers = []
61
+ songs = []
62
+ singers2songs = defaultdict(lambda: defaultdict(list))
63
+
64
+ singer_infos = glob(data_dir + "/*")
65
+
66
+ for singer_info in singer_infos:
67
+ singer = singer_info.split("/")[-1]
68
+
69
+ song_infos = glob(singer_info + "/*")
70
+
71
+ for song_info in song_infos:
72
+ song = song_info.split("/")[-1]
73
+
74
+ singers.append(singer)
75
+ songs.append(song)
76
+
77
+ utts = glob(song_info + "/*.wav")
78
+
79
+ for utt in utts:
80
+ uid = utt.split("/")[-1].split(".")[0]
81
+ singers2songs[singer][song].append(uid)
82
+
83
+ unique_singers = list(set(singers))
84
+ unique_songs = list(set(songs))
85
+ unique_singers.sort()
86
+ unique_songs.sort()
87
+
88
+ print(
89
+ "opera: {} singers, {} utterances ({} unique songs)".format(
90
+ len(unique_singers), len(songs), len(unique_songs)
91
+ )
92
+ )
93
+ print("Singers: \n{}".format("\t".join(unique_singers)))
94
+ return singers2songs, unique_singers
95
+
96
+
97
+ def main(output_path, dataset_path):
98
+ print("-" * 10)
99
+ print("Preparing test samples for opera...\n")
100
+
101
+ if not os.path.exists(os.path.join(dataset_path, "utterances")):
102
+ print("Splitting into utterances...\n")
103
+ _main(dataset_path)
104
+
105
+ save_dir = os.path.join(output_path, "opera")
106
+ os.makedirs(save_dir, exist_ok=True)
107
+ train_output_file = os.path.join(save_dir, "train.json")
108
+ test_output_file = os.path.join(save_dir, "test.json")
109
+ singer_dict_file = os.path.join(save_dir, "singers.json")
110
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
111
+ if (
112
+ has_existed(train_output_file)
113
+ and has_existed(test_output_file)
114
+ and has_existed(singer_dict_file)
115
+ and has_existed(utt2singer_file)
116
+ ):
117
+ return
118
+ utt2singer = open(utt2singer_file, "w")
119
+
120
+ # Load
121
+ opera_path = os.path.join(dataset_path, "utterances")
122
+
123
+ singers2songs, unique_singers = opera_statistics(opera_path)
124
+ test_songs = get_test_songs()
125
+
126
+ # We select songs of standard samples as test songs
127
+ train = []
128
+ test = []
129
+
130
+ train_index_count = 0
131
+ test_index_count = 0
132
+
133
+ train_total_duration = 0
134
+ test_total_duration = 0
135
+
136
+ for singer, songs in tqdm(singers2songs.items()):
137
+ song_names = list(songs.keys())
138
+
139
+ for chosen_song in song_names:
140
+ for chosen_uid in songs[chosen_song]:
141
+ res = {
142
+ "Dataset": "opera",
143
+ "Singer": singer,
144
+ "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
145
+ }
146
+ res["Path"] = "{}/{}/{}.wav".format(singer, chosen_song, chosen_uid)
147
+ res["Path"] = os.path.join(opera_path, res["Path"])
148
+ assert os.path.exists(res["Path"])
149
+
150
+ waveform, sample_rate = torchaudio.load(res["Path"])
151
+ duration = waveform.size(-1) / sample_rate
152
+ res["Duration"] = duration
153
+
154
+ if duration <= 1e-8:
155
+ continue
156
+
157
+ if ([singer, chosen_song]) in test_songs:
158
+ res["index"] = test_index_count
159
+ test_total_duration += duration
160
+ test.append(res)
161
+ test_index_count += 1
162
+ else:
163
+ res["index"] = train_index_count
164
+ train_total_duration += duration
165
+ train.append(res)
166
+ train_index_count += 1
167
+
168
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
169
+
170
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
171
+ print(
172
+ "#Train hours= {}, #Test hours= {}".format(
173
+ train_total_duration / 3600, test_total_duration / 3600
174
+ )
175
+ )
176
+
177
+ # Save train.json and test.json
178
+ with open(train_output_file, "w") as f:
179
+ json.dump(train, f, indent=4, ensure_ascii=False)
180
+ with open(test_output_file, "w") as f:
181
+ json.dump(test, f, indent=4, ensure_ascii=False)
182
+
183
+ # Save singers.json
184
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
185
+ with open(singer_dict_file, "w") as f:
186
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
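A small sketch of the golden-sample routing in main() above: each GOLDEN_TEST_SAMPLES["opera"] entry is assumed to be a "singer#song#uid" string (the "#0000" uid below is illustrative), so only its first two fields decide whether every utterance of that (singer, song) pair lands in the test split.

golden_samples = ["fem_01#neg_01#0000"]            # illustrative entry; the uid is made up
test_songs = [s.split("#")[:2] for s in golden_samples]

def split_of(singer, song):
    # mirrors the membership check used in main() above
    return "test" if [singer, song] in test_songs else "train"

print(split_of("fem_01", "neg_01"))  # test
print(split_of("fem_02", "pos_03"))  # train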
preprocessors/pjs.py ADDED
@@ -0,0 +1,135 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from tqdm import tqdm
8
+ import glob
9
+ import json
10
+ import torchaudio
11
+
12
+ from utils.util import has_existed
13
+ from utils.io import save_audio
14
+
15
+
16
+ def get_splitted_utterances(
17
+ raw_wav_dir, trimed_wav_dir, n_utterance_splits, overlapping
18
+ ):
19
+ res = []
20
+ raw_song_files = glob.glob(
21
+ os.path.join(raw_wav_dir, "**/pjs*_song.wav"), recursive=True
22
+ )
23
+ trimed_song_files = glob.glob(
24
+ os.path.join(trimed_wav_dir, "**/*.wav"), recursive=True
25
+ )
26
+
27
+ if len(raw_song_files) * n_utterance_splits == len(trimed_song_files):
28
+ print("Splitting already done...")
29
+ for wav_file in tqdm(trimed_song_files):
30
+ uid = wav_file.split("/")[-1].split(".")[0]
31
+ utt = {"Dataset": "pjs", "Singer": "male1", "Uid": uid, "Path": wav_file}
32
+
33
+ waveform, sample_rate = torchaudio.load(wav_file)
34
+ duration = waveform.size(-1) / sample_rate
35
+ utt["Duration"] = duration
36
+
37
+ res.append(utt)
38
+
39
+ else:
40
+ for wav_file in tqdm(raw_song_files):
41
+ song_id = wav_file.split("/")[-1].split(".")[0]
42
+
43
+ waveform, sample_rate = torchaudio.load(wav_file)
44
+ trimed_waveform = torchaudio.functional.vad(waveform, sample_rate)
45
+ trimed_waveform = torchaudio.functional.vad(
46
+ trimed_waveform.flip(dims=[1]), sample_rate
47
+ ).flip(dims=[1])
48
+
49
+ audio_len = trimed_waveform.size(-1)
50
+ lapping_len = overlapping * sample_rate
51
+
52
+ for i in range(n_utterance_splits):
53
+ start = i * audio_len // n_utterance_splits
54
+ end = start + audio_len // n_utterance_splits + lapping_len
55
+ splitted_waveform = trimed_waveform[:, start:end]
56
+
57
+ utt = {
58
+ "Dataset": "pjs",
59
+ "Singer": "male1",
60
+ "Uid": "{}_{}".format(song_id, i),
61
+ }
62
+
63
+ # Duration
64
+ duration = splitted_waveform.size(-1) / sample_rate
65
+ utt["Duration"] = duration
66
+
67
+ # Save trimed wav
68
+ splitted_waveform_file = os.path.join(
69
+ trimed_wav_dir, "{}.wav".format(utt["Uid"])
70
+ )
71
+ save_audio(splitted_waveform_file, splitted_waveform, sample_rate)
72
+
73
+ # Path
74
+ utt["Path"] = splitted_waveform_file
75
+
76
+ res.append(utt)
77
+
78
+ res = sorted(res, key=lambda x: x["Uid"])
79
+ return res
80
+
81
+
82
+ def main(output_path, dataset_path, n_utterance_splits=3, overlapping=1):
83
+ """
84
+ 1. Split each raw recording into n_utterance_splits pieces (default 3), since some samples are too long
85
+ 2. Adjacent splits overlap by overlapping seconds (default 1 s)
86
+ """
87
+ print("-" * 10)
88
+ print("Preparing training dataset for PJS...")
89
+
90
+ save_dir = os.path.join(output_path, "pjs")
91
+ raw_wav_dir = os.path.join(dataset_path, "PJS_corpus_ver1.1")
92
+
93
+ # Trim for silence
94
+ trimed_wav_dir = os.path.join(dataset_path, "trim")
95
+ os.makedirs(trimed_wav_dir, exist_ok=True)
96
+
97
+ # Total utterances
98
+ utterances = get_splitted_utterances(
99
+ raw_wav_dir, trimed_wav_dir, n_utterance_splits, overlapping
100
+ )
101
+ total_uids = [utt["Uid"] for utt in utterances]
102
+
103
+ # Test uids
104
+ n_test_songs = 3
105
+ test_uids = []
106
+ for i in range(1, n_test_songs + 1):
107
+ test_uids += [
108
+ "pjs00{}_song_{}".format(i, split_id)
109
+ for split_id in range(n_utterance_splits)
110
+ ]
111
+
112
+ # Train uids
113
+ train_uids = [uid for uid in total_uids if uid not in test_uids]
114
+
115
+ for dataset_type in ["train", "test"]:
116
+ output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
117
+ if has_existed(output_file):
118
+ continue
119
+
120
+ uids = eval("{}_uids".format(dataset_type))
121
+ res = [utt for utt in utterances if utt["Uid"] in uids]
122
+ for i in range(len(res)):
123
+ res[i]["index"] = i
124
+
125
+ time = sum([utt["Duration"] for utt in res])
126
+ print(
127
+ "{}, Total size: {}, Total Durations = {} s = {:.2f} hour\n".format(
128
+ dataset_type, len(res), time, time / 3600
129
+ )
130
+ )
131
+
132
+ # Save
133
+ os.makedirs(save_dir, exist_ok=True)
134
+ with open(output_file, "w") as f:
135
+ json.dump(res, f, indent=4, ensure_ascii=False)
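The silence trimming above relies on torchaudio.functional.vad, which only strips leading silence; running it a second time on the time-reversed waveform (the flip/flip pattern) also strips trailing silence. A standalone sketch, assuming a (channels, samples) tensor and a hypothetical file path:

import torchaudio

def trim_both_ends(waveform, sample_rate):
    # vad() removes leading silence only, so flip, trim again, and flip back.
    trimmed = torchaudio.functional.vad(waveform, sample_rate)
    trimmed = torchaudio.functional.vad(trimmed.flip(dims=[1]), sample_rate).flip(dims=[1])
    return trimmed

waveform, sr = torchaudio.load("pjs001_song.wav")  # hypothetical path
print(trim_both_ends(waveform, sr).shape)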
preprocessors/popbutfy.py ADDED
@@ -0,0 +1,153 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import torchaudio
9
+ import librosa
10
+ from tqdm import tqdm
11
+ from glob import glob
12
+ from collections import defaultdict
13
+
14
+ from utils.util import has_existed
15
+ from preprocessors import GOLDEN_TEST_SAMPLES
16
+
17
+
18
+ def get_test_songs():
19
+ golden_samples = GOLDEN_TEST_SAMPLES["popbutfy"]
20
+ # every item is a tuple (singer, song)
21
+ golden_songs = [s.split("#")[:2] for s in golden_samples]
22
+ # singer#song, eg: Female1#Almost_lover_Amateur
23
+ return golden_songs
24
+
25
+
26
+ def popbutfy_statistics(data_dir):
27
+ singers = []
28
+ songs = []
29
+ singer2songs = defaultdict(lambda: defaultdict(list))
30
+
31
+ data_infos = glob(data_dir + "/*")
32
+
33
+ for data_info in data_infos:
34
+ data_info_split = data_info.split("/")[-1].split("#")
35
+
36
+ singer, song = data_info_split[0], data_info_split[-1]
37
+ singers.append(singer)
38
+ songs.append(song)
39
+
40
+ utts = glob(data_info + "/*")
41
+
42
+ for utt in utts:
43
+ uid = utt.split("/")[-1].split("_")[-1].split(".")[0]
44
+ singer2songs[singer][song].append(uid)
45
+
46
+ unique_singers = list(set(singers))
47
+ unique_songs = list(set(songs))
48
+ unique_singers.sort()
49
+ unique_songs.sort()
50
+
51
+ print(
52
+ "PopBuTFy: {} singers, {} utterances ({} unique songs)".format(
53
+ len(unique_singers), len(songs), len(unique_songs)
54
+ )
55
+ )
56
+ print("Singers: \n{}".format("\t".join(unique_singers)))
57
+ return singer2songs, unique_singers
58
+
59
+
60
+ def main(output_path, dataset_path):
61
+ print("-" * 10)
62
+ print("Preparing test samples for popbutfy...\n")
63
+
64
+ save_dir = os.path.join(output_path, "popbutfy")
65
+ os.makedirs(save_dir, exist_ok=True)
66
+ train_output_file = os.path.join(save_dir, "train.json")
67
+ test_output_file = os.path.join(save_dir, "test.json")
68
+ singer_dict_file = os.path.join(save_dir, "singers.json")
69
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
70
+ if (
71
+ has_existed(train_output_file)
72
+ and has_existed(test_output_file)
73
+ and has_existed(singer_dict_file)
74
+ and has_existed(utt2singer_file)
75
+ ):
76
+ return
77
+ utt2singer = open(utt2singer_file, "w")
78
+
79
+ # Load
80
+ popbutfy_dir = dataset_path
81
+
82
+ singer2songs, unique_singers = popbutfy_statistics(popbutfy_dir)
83
+ test_songs = get_test_songs()
84
+
85
+ # We select songs of standard samples as test songs
86
+ train = []
87
+ test = []
88
+
89
+ train_index_count = 0
90
+ test_index_count = 0
91
+
92
+ train_total_duration = 0
93
+ test_total_duration = 0
94
+
95
+ for singer, songs in tqdm(singer2songs.items()):
96
+ song_names = list(songs.keys())
97
+
98
+ for chosen_song in song_names:
99
+ for chosen_uid in songs[chosen_song]:
100
+ res = {
101
+ "Dataset": "popbutfy",
102
+ "Singer": singer,
103
+ "Song": chosen_song,
104
+ "Uid": "{}#{}#{}".format(singer, chosen_song, chosen_uid),
105
+ }
106
+ res["Path"] = "{}#singing#{}/{}#singing#{}_{}.mp3".format(
107
+ singer, chosen_song, singer, chosen_song, chosen_uid
108
+ )
109
+ if not os.path.exists(os.path.join(popbutfy_dir, res["Path"])):
110
+ res["Path"] = "{}#singing#{}/{}#singing#{}_{}.wav".format(
111
+ singer, chosen_song, singer, chosen_song, chosen_uid
112
+ )
113
+ res["Path"] = os.path.join(popbutfy_dir, res["Path"])
114
+ assert os.path.exists(res["Path"])
115
+
116
+ if res["Path"].split("/")[-1].split(".")[-1] == "wav":
117
+ waveform, sample_rate = torchaudio.load(res["Path"])
118
+ duration = waveform.size(-1) / sample_rate
119
+ else:
120
+ waveform, sample_rate = librosa.load(res["Path"])
121
+ duration = waveform.shape[-1] / sample_rate
122
+ res["Duration"] = duration
123
+
124
+ if ([singer, chosen_song]) in test_songs:
125
+ res["index"] = test_index_count
126
+ test_total_duration += duration
127
+ test.append(res)
128
+ test_index_count += 1
129
+ else:
130
+ res["index"] = train_index_count
131
+ train_total_duration += duration
132
+ train.append(res)
133
+ train_index_count += 1
134
+
135
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
136
+
137
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
138
+ print(
139
+ "#Train hours= {}, #Test hours= {}".format(
140
+ train_total_duration / 3600, test_total_duration / 3600
141
+ )
142
+ )
143
+
144
+ # Save train.json and test.json
145
+ with open(train_output_file, "w") as f:
146
+ json.dump(train, f, indent=4, ensure_ascii=False)
147
+ with open(test_output_file, "w") as f:
148
+ json.dump(test, f, indent=4, ensure_ascii=False)
149
+
150
+ # Save singers.json
151
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
152
+ with open(singer_dict_file, "w") as f:
153
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
preprocessors/popcs.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import torchaudio
9
+ from glob import glob
10
+ from collections import defaultdict
11
+
12
+ from utils.util import has_existed
13
+ from preprocessors import GOLDEN_TEST_SAMPLES
14
+
15
+
16
+ def get_test_songs():
17
+ golden_samples = GOLDEN_TEST_SAMPLES["popcs"]
18
+ # every item is a one-element list: [song]
19
+ golden_songs = [s.split("_")[:1] for s in golden_samples]
20
+ # song, eg: 万有引力
21
+ return golden_songs
22
+
23
+
24
+ def popcs_statistics(data_dir):
25
+ songs = []
26
+ songs2utts = defaultdict(list)
27
+
28
+ song_infos = glob(data_dir + "/*")
29
+
30
+ for song_info in song_infos:
31
+ song_info_split = song_info.split("/")[-1].split("-")[-1]
32
+
33
+ songs.append(song_info_split)
34
+
35
+ utts = glob(song_info + "/*.wav")
36
+
37
+ for utt in utts:
38
+ uid = utt.split("/")[-1].split("_")[0]
39
+ songs2utts[song_info_split].append(uid)
40
+
41
+ unique_songs = list(set(songs))
42
+ unique_songs.sort()
43
+
44
+ print(
45
+ "popcs: {} utterances ({} unique songs)".format(len(songs), len(unique_songs))
46
+ )
47
+ print("Songs: \n{}".format("\t".join(unique_songs)))
48
+ return songs2utts
49
+
50
+
51
+ def main(output_path, dataset_path):
52
+ print("-" * 10)
53
+ print("Preparing test samples for popcs...\n")
54
+
55
+ save_dir = os.path.join(output_path, "popcs")
56
+ train_output_file = os.path.join(save_dir, "train.json")
57
+ test_output_file = os.path.join(save_dir, "test.json")
58
+ if has_existed(test_output_file):
59
+ return
60
+
61
+ # Load
62
+ popcs_dir = dataset_path
63
+
64
+ songs2utts = popcs_statistics(popcs_dir)
65
+ test_songs = get_test_songs()
66
+
67
+ # We select songs of standard samples as test songs
68
+ train = []
69
+ test = []
70
+
71
+ train_index_count = 0
72
+ test_index_count = 0
73
+
74
+ train_total_duration = 0
75
+ test_total_duration = 0
76
+
77
+ song_names = list(songs2utts.keys())
78
+
79
+ for chosen_song in song_names:
80
+ for chosen_uid in songs2utts[chosen_song]:
81
+ res = {
82
+ "Dataset": "popcs",
83
+ "Singer": "female1",
84
+ "Song": chosen_song,
85
+ "Uid": "{}_{}".format(chosen_song, chosen_uid),
86
+ }
87
+ res["Path"] = "popcs-{}/{}_wf0.wav".format(chosen_song, chosen_uid)
88
+ res["Path"] = os.path.join(popcs_dir, res["Path"])
89
+ assert os.path.exists(res["Path"])
90
+
91
+ waveform, sample_rate = torchaudio.load(res["Path"])
92
+ duration = waveform.size(-1) / sample_rate
93
+ res["Duration"] = duration
94
+
95
+ if ([chosen_song]) in test_songs:
96
+ res["index"] = test_index_count
97
+ test_total_duration += duration
98
+ test.append(res)
99
+ test_index_count += 1
100
+ else:
101
+ res["index"] = train_index_count
102
+ train_total_duration += duration
103
+ train.append(res)
104
+ train_index_count += 1
105
+
106
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
107
+ print(
108
+ "#Train hours= {}, #Test hours= {}".format(
109
+ train_total_duration / 3600, test_total_duration / 3600
110
+ )
111
+ )
112
+
113
+ # Save
114
+ os.makedirs(save_dir, exist_ok=True)
115
+ with open(train_output_file, "w") as f:
116
+ json.dump(train, f, indent=4, ensure_ascii=False)
117
+ with open(test_output_file, "w") as f:
118
+ json.dump(test, f, indent=4, ensure_ascii=False)
preprocessors/processor.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+ from preprocessors import (
8
+ m4singer,
9
+ opencpop,
10
+ svcc,
11
+ pjs,
12
+ popbutfy,
13
+ opensinger,
14
+ popcs,
15
+ kising,
16
+ csd,
17
+ opera,
18
+ nus48e,
19
+ svcceval,
20
+ vctk,
21
+ vctksample,
22
+ libritts,
23
+ lijian,
24
+ cdmusiceval,
25
+ ljspeech,
26
+ coco,
27
+ cocoeval,
28
+ custom,
29
+ vocalist,
30
+ ljspeech_vocoder,
31
+ )
32
+
33
+
34
+ def preprocess_dataset(
35
+ dataset, dataset_path, output_path, cfg, is_custom_dataset=False
36
+ ):
37
+ """Call specific function to handle specific dataset
38
+ Args:
39
+ dataset (str): name of a dataset, e.g. opencpop, m4singer
40
+ dataset_path (str): path to dataset
41
+ output_path (str): path to store preprocessing result files
42
+ """
43
+ if is_custom_dataset:
44
+ custom.main(output_path, dataset_path, dataset_name=dataset)
45
+ return
46
+
47
+ if re.match("opencpop*", dataset):
48
+ opencpop.main(dataset, output_path, dataset_path)
49
+ if dataset == "m4singer":
50
+ m4singer.main(output_path, dataset_path)
51
+ if dataset == "svcc":
52
+ svcc.main(output_path, dataset_path)
53
+ if dataset == "pjs":
54
+ pjs.main(output_path, dataset_path)
55
+ if dataset == "popbutfy":
56
+ popbutfy.main(output_path, dataset_path)
57
+ if dataset == "opensinger":
58
+ opensinger.main(output_path, dataset_path)
59
+ if dataset == "popcs":
60
+ popcs.main(output_path, dataset_path)
61
+ if dataset == "kising":
62
+ kising.main(output_path, dataset_path)
63
+ if dataset == "csd":
64
+ csd.main(output_path, dataset_path)
65
+ if dataset == "opera":
66
+ opera.main(output_path, dataset_path)
67
+ if dataset == "nus48e":
68
+ nus48e.main(output_path, dataset_path)
69
+ if dataset == "vctk":
70
+ vctk.main(output_path, dataset_path)
71
+ if dataset == "svcceval":
72
+ svcceval.main(output_path, dataset_path)
73
+ if dataset == "libritts":
74
+ libritts.main(output_path, dataset_path)
75
+ if dataset == "lijian":
76
+ lijian.main(output_path, dataset_path)
77
+ if dataset == "cdmusiceval":
78
+ cdmusiceval.main(output_path, dataset_path)
79
+ if dataset == "LJSpeech":
80
+ ljspeech.main(output_path, dataset_path, cfg)
81
+ if dataset == "ljspeech":
82
+ ljspeech_vocoder.main(output_path, dataset_path)
83
+ if dataset == "coco":
84
+ coco.main(output_path, dataset_path)
85
+ if dataset == "cocoeval":
86
+ cocoeval.main(output_path, dataset_path)
87
+ if dataset == "vocalist":
88
+ vocalist.main(output_path, dataset_path)
89
+
90
+
91
+ def prepare_align(dataset, dataset_path, cfg, output_path):
92
+ """Call specific function to handle specific dataset
93
+
94
+ Args:
95
+ dataset (str): name of a dataset, e.g. ljspeech
96
+ dataset_path (str): path to dataset
97
+ output_path (str): path to store preprocessing result files
98
+ """
99
+ if dataset == "LJSpeech":
100
+ ljspeech.prepare_align(dataset, dataset_path, cfg, output_path)
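A minimal usage sketch of the dispatcher above; the paths are placeholders, and cfg can stay None for branches (such as opera) that do not read it:

# Hypothetical paths; only the LJSpeech branches consult cfg.
preprocess_dataset(
    dataset="opera",
    dataset_path="/path/to/opera",
    output_path="/path/to/processed_data",
    cfg=None,
    is_custom_dataset=False,
)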
preprocessors/svcc.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import glob
8
+ import librosa
9
+ import json
10
+
11
+ from utils.util import has_existed
12
+ from preprocessors import GOLDEN_TEST_SAMPLES
13
+
14
+
15
+ def main(output_path, dataset_path):
16
+ print("-" * 10)
17
+ print("Preparing training dataset for svcc...")
18
+
19
+ data_dir = os.path.join(dataset_path, "Data")
20
+ save_dir = os.path.join(output_path, "svcc")
21
+ os.makedirs(save_dir, exist_ok=True)
22
+
23
+ singer_dict_file = os.path.join(save_dir, "singers.json")
24
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
25
+ utt2singer = open(utt2singer_file, "w")
26
+
27
+ # Load utterances
28
+ train = []
29
+ test = []
30
+ singers = []
31
+
32
+ for wav_file in glob.glob(os.path.join(data_dir, "*/*.wav")):
33
+ singer, filename = wav_file.split("/")[-2:]
34
+ uid = filename.split(".")[0]
35
+ utt = {
36
+ "Dataset": "svcc",
37
+ "Singer": singer,
38
+ "Uid": "{}_{}".format(singer, uid),
39
+ "Path": wav_file,
40
+ }
41
+
42
+ # Duration
43
+ duration = librosa.get_duration(filename=wav_file)
44
+ utt["Duration"] = duration
45
+
46
+ if utt["Uid"] in GOLDEN_TEST_SAMPLES["svcc"]:
47
+ test.append(utt)
48
+ else:
49
+ train.append(utt)
50
+
51
+ singers.append(singer)
52
+ utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))
53
+
54
+ # Save singers.json
55
+ unique_singers = list(set(singers))
56
+ unique_singers.sort()
57
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
58
+ with open(singer_dict_file, "w") as f:
59
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
60
+
61
+ train_total_duration = sum([utt["Duration"] for utt in train])
62
+ test_total_duration = sum([utt["Duration"] for utt in test])
63
+
64
+ for dataset_type in ["train", "test"]:
65
+ output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
66
+ if has_existed(output_file):
67
+ continue
68
+
69
+ utterances = eval(dataset_type)
70
+ utterances = sorted(utterances, key=lambda x: x["Uid"])
71
+
72
+ for i in range(len(utterances)):
73
+ utterances[i]["index"] = i
74
+
75
+ print("{}: Total size: {}\n".format(dataset_type, len(utterances)))
76
+
77
+ # Save
78
+ with open(output_file, "w") as f:
79
+ json.dump(utterances, f, indent=4, ensure_ascii=False)
80
+
81
+ print(
82
+ "#Train hours= {}, #Test hours= {}".format(
83
+ train_total_duration / 3600, test_total_duration / 3600
84
+ )
85
+ )
preprocessors/svcceval.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import glob
8
+ import librosa
9
+ import json
10
+
11
+ from utils.util import has_existed
12
+
13
+
14
+ def main(output_path, dataset_path):
15
+ print("-" * 10)
16
+ print("Preparing training dataset for svcceval...")
17
+
18
+ data_dir = os.path.join(dataset_path, "Data")
19
+ save_dir = os.path.join(output_path, "svcceval")
20
+ os.makedirs(save_dir, exist_ok=True)
21
+
22
+ singer_dict_file = os.path.join(save_dir, "singers.json")
23
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
24
+ utt2singer = open(utt2singer_file, "w")
25
+
26
+ # Load utterances
27
+ train = []
28
+ test = []
29
+ singers = []
30
+ for wav_file in glob.glob(os.path.join(data_dir, "*/*.wav")):
31
+ singer, filename = wav_file.split("/")[-2:]
32
+ uid = filename.split(".")[0]
33
+ utt = {
34
+ "Dataset": "svcceval",
35
+ "Singer": singer,
36
+ "Uid": "{}_{}".format(singer, uid),
37
+ "Path": wav_file,
38
+ }
39
+
40
+ # Duration
41
+ duration = librosa.get_duration(filename=wav_file)
42
+ utt["Duration"] = duration
43
+
44
+ test.append(utt)
45
+
46
+ singers.append(singer)
47
+ utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))
48
+
49
+ # Save singers.json
50
+ unique_singers = list(set(singers))
51
+ unique_singers.sort()
52
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
53
+ with open(singer_dict_file, "w") as f:
54
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
55
+
56
+ train_total_duration = sum([utt["Duration"] for utt in train])
57
+ test_total_duration = sum([utt["Duration"] for utt in test])
58
+
59
+ for dataset_type in ["train", "test"]:
60
+ output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
61
+ if has_existed(output_file):
62
+ continue
63
+
64
+ utterances = eval(dataset_type)
65
+ utterances = sorted(utterances, key=lambda x: x["Uid"])
66
+
67
+ for i in range(len(utterances)):
68
+ utterances[i]["index"] = i
69
+
70
+ print("{}: Total size: {}\n".format(dataset_type, len(utterances)))
71
+
72
+ # Save
73
+ with open(output_file, "w") as f:
74
+ json.dump(utterances, f, indent=4, ensure_ascii=False)
75
+
76
+ print(
77
+ "#Train hours= {}, #Test hours= {}".format(
78
+ train_total_duration / 3600, test_total_duration / 3600
79
+ )
80
+ )
preprocessors/vctk.py ADDED
@@ -0,0 +1,163 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import librosa
9
+ from tqdm import tqdm
10
+ from glob import glob
11
+ from collections import defaultdict
12
+
13
+ from utils.util import has_existed
14
+
15
+
16
+ def get_lines(file):
17
+ with open(file, "r") as f:
18
+ lines = f.readlines()
19
+ lines = [l.strip() for l in lines]
20
+ return lines
21
+
22
+
23
+ def vctk_statistics(data_dir):
24
+ speakers = []
25
+ speakers2utts = defaultdict(list)
26
+
27
+ speaker_infos = glob(data_dir + "/wav48_silence_trimmed" + "/*")
28
+
29
+ for speaker_info in speaker_infos:
30
+ speaker = speaker_info.split("/")[-1]
31
+
32
+ if speaker == "log.txt":
33
+ continue
34
+
35
+ speakers.append(speaker)
36
+
37
+ utts = glob(speaker_info + "/*")
38
+
39
+ for utt in utts:
40
+ uid = (
41
+ utt.split("/")[-1].split("_")[1]
42
+ + "_"
43
+ + utt.split("/")[-1].split("_")[2].split(".")[0]
44
+ )
45
+ speakers2utts[speaker].append(uid)
46
+
47
+ unique_speakers = list(set(speakers))
48
+ unique_speakers.sort()
49
+
50
+ print("Speakers: \n{}".format("\t".join(unique_speakers)))
51
+ return speakers2utts, unique_speakers
52
+
53
+
54
+ def vctk_speaker_infos(data_dir):
55
+ file = os.path.join(data_dir, "speaker-info.txt")
56
+ lines = get_lines(file)
57
+
58
+ ID2speakers = defaultdict()
59
+ for l in tqdm(lines):
60
+ items = l.replace(" ", "")
61
+
62
+ if items[:2] == "ID":
63
+ # The header line
64
+ continue
65
+
66
+ if items[0] == "p":
67
+ id = items[:4]
68
+ gender = items[6]
69
+ elif items[0] == "s":
70
+ id = items[:2]
71
+ gender = items[4]
72
+
73
+ if gender == "F":
74
+ speaker = "female_{}".format(id)
75
+ elif gender == "M":
76
+ speaker = "male_{}".format(id)
77
+
78
+ ID2speakers[id] = speaker
79
+
80
+ return ID2speakers
81
+
82
+
83
+ def main(output_path, dataset_path, TEST_NUM_OF_EVERY_SPEAKER=3):
84
+ print("-" * 10)
85
+ print("Preparing test samples for vctk...")
86
+
87
+ save_dir = os.path.join(output_path, "vctk")
88
+ os.makedirs(save_dir, exist_ok=True)
89
+ train_output_file = os.path.join(save_dir, "train.json")
90
+ test_output_file = os.path.join(save_dir, "test.json")
91
+ singer_dict_file = os.path.join(save_dir, "singers.json")
92
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
93
+ if has_existed(train_output_file):
94
+ return
95
+ utt2singer = open(utt2singer_file, "w")
96
+
97
+ # Load
98
+ vctk_dir = dataset_path
99
+
100
+ ID2speakers = vctk_speaker_infos(vctk_dir)
101
+ speaker2utts, unique_speakers = vctk_statistics(vctk_dir)
102
+
103
+ # We select speakers of standard samples as test utts
104
+ train = []
105
+ test = []
106
+
107
+ train_index_count = 0
108
+ test_index_count = 0
109
+ test_speaker_count = defaultdict(int)
110
+
111
+ train_total_duration = 0
112
+ test_total_duration = 0
113
+
114
+ for i, speaker in enumerate(speaker2utts.keys()):
115
+ for chosen_uid in tqdm(
116
+ speaker2utts[speaker],
117
+ desc="Speaker {}/{}, #Train = {}, #Test = {}".format(
118
+ i + 1, len(speaker2utts), train_index_count, test_index_count
119
+ ),
120
+ ):
121
+ res = {
122
+ "Dataset": "vctk",
123
+ "Singer": ID2speakers[speaker],
124
+ "Uid": "{}#{}".format(ID2speakers[speaker], chosen_uid),
125
+ }
126
+ res["Path"] = "{}/{}_{}.flac".format(speaker, speaker, chosen_uid)
127
+ res["Path"] = os.path.join(vctk_dir, "wav48_silence_trimmed", res["Path"])
128
+ assert os.path.exists(res["Path"])
129
+
130
+ duration = librosa.get_duration(filename=res["Path"])
131
+ res["Duration"] = duration
132
+
133
+ if test_speaker_count[speaker] < TEST_NUM_OF_EVERY_SPEAKER:
134
+ res["index"] = test_index_count
135
+ test_total_duration += duration
136
+ test.append(res)
137
+ test_index_count += 1
138
+ test_speaker_count[speaker] += 1
139
+ else:
140
+ res["index"] = train_index_count
141
+ train_total_duration += duration
142
+ train.append(res)
143
+ train_index_count += 1
144
+
145
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
146
+
147
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
148
+ print(
149
+ "#Train hours= {}, #Test hours= {}".format(
150
+ train_total_duration / 3600, test_total_duration / 3600
151
+ )
152
+ )
153
+
154
+ # Save train.json and test.json
155
+ with open(train_output_file, "w") as f:
156
+ json.dump(train, f, indent=4, ensure_ascii=False)
157
+ with open(test_output_file, "w") as f:
158
+ json.dump(test, f, indent=4, ensure_ascii=False)
159
+
160
+ # Save singers.json
161
+ singer_lut = {name: i for i, name in enumerate(unique_speakers)}
162
+ with open(singer_dict_file, "w") as f:
163
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
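For reference, a sketch of the uid handling in vctk_statistics and main() above, using the hypothetical file p225/p225_001_mic1.flac (the "speaker_utt_mic" naming is assumed): the uid keeps everything after the speaker prefix, and the path is later rebuilt from speaker plus uid.

utt = "wav48_silence_trimmed/p225/p225_001_mic1.flac"  # hypothetical file
fields = utt.split("/")[-1].split("_")                 # ["p225", "001", "mic1.flac"]
uid = fields[1] + "_" + fields[2].split(".")[0]        # "001_mic1"
speaker = utt.split("/")[-2]                           # "p225"
rebuilt = "{}/{}_{}.flac".format(speaker, speaker, uid)
print(rebuilt)  # p225/p225_001_mic1.flac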
preprocessors/vctkfewsinger.py ADDED
@@ -0,0 +1,175 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import pickle
9
+ import glob
10
+ from collections import defaultdict
11
+ from tqdm import tqdm
12
+
13
+
14
+ # Train: male 20 hours, female 10 hours
15
+ TRAIN_MALE_MAX_SECONDS = 20 * 3600
16
+ TRAIN_FEMALE_MAX_SECONDS = 10 * 3600
17
+ TEST_MAX_NUM_EVERY_PERSON = 5
18
+
19
+
20
+ def select_sample_idxs():
21
+ chosen_speakers = get_chosen_speakers()
22
+
23
+ with open(os.path.join(vctk_dir, "train.json"), "r") as f:
24
+ raw_train = json.load(f)
25
+ with open(os.path.join(vctk_dir, "test.json"), "r") as f:
26
+ raw_test = json.load(f)
27
+
28
+ train_idxs, test_idxs = [], []
29
+
30
+ # =========== Test ===========
31
+ test_nums = defaultdict(int)
32
+ for utt in tqdm(raw_train):
33
+ idx = utt["index"]
34
+ singer = utt["Singer"]
35
+
36
+ if singer in chosen_speakers and test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
37
+ test_nums[singer] += 1
38
+ test_idxs.append("train_{}".format(idx))
39
+
40
+ for utt in tqdm(raw_test):
41
+ idx = utt["index"]
42
+ singer = utt["Singer"]
43
+
44
+ if singer in chosen_speakers and test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
45
+ test_nums[singer] += 1
46
+ test_idxs.append("test_{}".format(idx))
47
+
48
+ # =========== Train ===========
49
+ for utt in tqdm(raw_train):
50
+ idx = utt["index"]
51
+ singer = utt["Singer"]
52
+
53
+ if singer in chosen_speakers and "train_{}".format(idx) not in test_idxs:
54
+ train_idxs.append("train_{}".format(idx))
55
+
56
+ for utt in tqdm(raw_test):
57
+ idx = utt["index"]
58
+ singer = utt["Singer"]
59
+
60
+ if singer in chosen_speakers and "test_{}".format(idx) not in test_idxs:
61
+ train_idxs.append("test_{}".format(idx))
62
+
63
+ train_idxs.sort()
64
+ test_idxs.sort()
65
+ return train_idxs, test_idxs, raw_train, raw_test
66
+
67
+
68
+ def statistics_of_speakers():
69
+ speaker2time = defaultdict(float)
70
+ sex2time = defaultdict(float)
71
+
72
+ with open(os.path.join(vctk_dir, "train.json"), "r") as f:
73
+ train = json.load(f)
74
+ with open(os.path.join(vctk_dir, "test.json"), "r") as f:
75
+ test = json.load(f)
76
+
77
+ for utt in train + test:
78
+ # accumulate seconds per speaker (printed in minutes below)
79
+ speaker2time[utt["Singer"]] += utt["Duration"]
80
+ # accumulate seconds per gender (printed in hours below)
81
+ sex2time[utt["Singer"].split("_")[0]] += utt["Duration"]
82
+
83
+ print(
84
+ "Female: {:.2f} hours, Male: {:.2f} hours.\n".format(
85
+ sex2time["female"] / 3600, sex2time["male"] / 3600
86
+ )
87
+ )
88
+
89
+ speaker2time = sorted(speaker2time.items(), key=lambda x: x[-1], reverse=True)
90
+ for singer, seconds in speaker2time:
91
+ print("{}\t{:.2f} mins".format(singer, seconds / 60))
92
+
93
+ return speaker2time
94
+
95
+
96
+ def get_chosen_speakers():
97
+ speaker2time = statistics_of_speakers()
98
+
99
+ chosen_time = defaultdict(float)
100
+ chosen_speaker = defaultdict(list)
101
+ train_constraint = {
102
+ "male": TRAIN_MALE_MAX_SECONDS,
103
+ "female": TRAIN_FEMALE_MAX_SECONDS,
104
+ }
105
+
106
+ for speaker, seconds in speaker2time:
107
+ sex = speaker.split("_")[0]
108
+ if chosen_time[sex] < train_constraint[sex]:
109
+ chosen_time[sex] += seconds
110
+ chosen_speaker[sex].append(speaker)
111
+
112
+ speaker2time = dict(speaker2time)
113
+ chosen_speaker = chosen_speaker["male"] + chosen_speaker["female"]
114
+ print("\n#Chosen speakers = {}".format(len(chosen_speaker)))
115
+ for spk in chosen_speaker:
116
+ print("{}\t{:.2f} mins".format(spk, speaker2time[spk] / 60))
117
+
118
+ return chosen_speaker
119
+
120
+
121
+ if __name__ == "__main__":
122
+ root_path = ""
123
+ vctk_dir = os.path.join(root_path, "vctk")
124
+ fewspeaker_dir = os.path.join(root_path, "vctkfewspeaker")
125
+ os.makedirs(fewspeaker_dir, exist_ok=True)
126
+
127
+ train_idxs, test_idxs, raw_train, raw_test = select_sample_idxs()
128
+ print("#Train = {}, #Test = {}".format(len(train_idxs), len(test_idxs)))
129
+
130
+ # There are no data leakage
131
+ assert len(set(train_idxs).intersection(set(test_idxs))) == 0
132
+ for idx in train_idxs + test_idxs:
133
+ # No test data of raw vctk
134
+ assert "test_" not in idx
135
+
136
+ for split, chosen_idxs in zip(["train", "test"], [train_idxs, test_idxs]):
137
+ print("{}: #chosen idx = {}\n".format(split, len(chosen_idxs)))
138
+
139
+ # Select features
140
+ feat_files = glob.glob("**/train.pkl", root_dir=vctk_dir, recursive=True)
141
+ for file in tqdm(feat_files):
142
+ raw_file = os.path.join(vctk_dir, file)
143
+ new_file = os.path.join(
144
+ fewspeaker_dir, file.replace("train.pkl", "{}.pkl".format(split))
145
+ )
146
+
147
+ new_dir = "/".join(new_file.split("/")[:-1])
148
+ os.makedirs(new_dir, exist_ok=True)
149
+
150
+ if "mel_min" in file or "mel_max" in file:
151
+ os.system("cp {} {}".format(raw_file, new_file))
152
+ continue
153
+
154
+ with open(raw_file, "rb") as f:
155
+ raw_feats = pickle.load(f)
156
+
157
+ print("file: {}, #raw_feats = {}".format(file, len(raw_feats)))
158
+ new_feats = []
159
+ for idx in chosen_idxs:
160
+ chosen_split_is_train, raw_idx = idx.split("_")
161
+ assert chosen_split_is_train == "train"
162
+ new_feats.append(raw_feats[int(raw_idx)])
163
+
164
+ with open(new_file, "wb") as f:
165
+ pickle.dump(new_feats, f)
166
+ print("New file: {}, #new_feats = {}".format(new_file, len(new_feats)))
167
+
168
+ # Utterance re-index
169
+ news_utts = [raw_train[int(idx.split("_")[-1])] for idx in chosen_idxs]
170
+ for i, utt in enumerate(news_utts):
171
+ utt["Dataset"] = "vctkfewsinger"
172
+ utt["index"] = i
173
+
174
+ with open(os.path.join(fewspeaker_dir, "{}.json".format(split)), "w") as f:
175
+ json.dump(news_utts, f, indent=4)
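The speaker selection in get_chosen_speakers() above is a greedy budget fill: speakers are visited in descending order of recorded time, and a speaker is added whenever the running total for its gender is still below the budget (20 h male, 10 h female), so the final total can overshoot by at most one speaker. A self-contained sketch with made-up durations:

from collections import defaultdict

# (speaker, total_seconds), sorted by duration descending; values are made up
speaker2time = [("male_p226", 9000.0), ("female_p225", 8000.0), ("male_p227", 7000.0)]
budget = {"male": 20 * 3600, "female": 10 * 3600}

chosen_time, chosen = defaultdict(float), []
for speaker, seconds in speaker2time:
    sex = speaker.split("_")[0]
    if chosen_time[sex] < budget[sex]:   # greedy: add while still under budget
        chosen_time[sex] += seconds
        chosen.append(speaker)

print(chosen)  # all three fit comfortably under these budgets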
preprocessors/vctksample.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import pickle
9
+ import glob
10
+ from collections import defaultdict
11
+ from tqdm import tqdm
12
+ from preprocessors import get_golden_samples_indexes
13
+
14
+
15
+ TRAIN_MAX_NUM_EVERY_PERSON = 250
16
+ TEST_MAX_NUM_EVERY_PERSON = 25
17
+
18
+
19
+ def select_sample_idxs():
20
+ # =========== Train ===========
21
+ with open(os.path.join(vctk_dir, "train.json"), "r") as f:
22
+ raw_train = json.load(f)
23
+
24
+ train_idxs = []
25
+ train_nums = defaultdict(int)
26
+ for utt in tqdm(raw_train):
27
+ idx = utt["index"]
28
+ singer = utt["Singer"]
29
+
30
+ if train_nums[singer] < TRAIN_MAX_NUM_EVERY_PERSON:
31
+ train_idxs.append(idx)
32
+ train_nums[singer] += 1
33
+
34
+ # =========== Test ===========
35
+ with open(os.path.join(vctk_dir, "test.json"), "r") as f:
36
+ raw_test = json.load(f)
37
+
38
+ # golden test
39
+ test_idxs = get_golden_samples_indexes(
40
+ dataset_name="vctk", split="test", dataset_dir=vctk_dir
41
+ )
42
+ test_nums = defaultdict(int)
43
+ for idx in test_idxs:
44
+ singer = raw_test[idx]["Singer"]
45
+ test_nums[singer] += 1
46
+
47
+ for utt in tqdm(raw_test):
48
+ idx = utt["index"]
49
+ singer = utt["Singer"]
50
+
51
+ if test_nums[singer] < TEST_MAX_NUM_EVERY_PERSON:
52
+ test_idxs.append(idx)
53
+ test_nums[singer] += 1
54
+
55
+ train_idxs.sort()
56
+ test_idxs.sort()
57
+ return train_idxs, test_idxs, raw_train, raw_test
58
+
59
+
60
+ if __name__ == "__main__":
61
+ root_path = ""
62
+ vctk_dir = os.path.join(root_path, "vctk")
63
+ sample_dir = os.path.join(root_path, "vctksample")
64
+ os.makedirs(sample_dir, exist_ok=True)
65
+
66
+ train_idxs, test_idxs, raw_train, raw_test = select_sample_idxs()
67
+ print("#Train = {}, #Test = {}".format(len(train_idxs), len(test_idxs)))
68
+
69
+ for split, chosen_idxs, utterances in zip(
70
+ ["train", "test"], [train_idxs, test_idxs], [raw_train, raw_test]
71
+ ):
72
+ print(
73
+ "#{} = {}, #chosen idx = {}\n".format(
74
+ split, len(utterances), len(chosen_idxs)
75
+ )
76
+ )
77
+
78
+ # Select features
79
+ feat_files = glob.glob(
80
+ "**/{}.pkl".format(split), root_dir=vctk_dir, recursive=True
81
+ )
82
+ for file in tqdm(feat_files):
83
+ raw_file = os.path.join(vctk_dir, file)
84
+ new_file = os.path.join(sample_dir, file)
85
+
86
+ new_dir = "/".join(new_file.split("/")[:-1])
87
+ os.makedirs(new_dir, exist_ok=True)
88
+
89
+ if "mel_min" in file or "mel_max" in file:
90
+ os.system("cp {} {}".format(raw_file, new_file))
91
+ continue
92
+
93
+ with open(raw_file, "rb") as f:
94
+ raw_feats = pickle.load(f)
95
+
96
+ print("file: {}, #raw_feats = {}".format(file, len(raw_feats)))
97
+ new_feats = [raw_feats[idx] for idx in chosen_idxs]
98
+ with open(new_file, "wb") as f:
99
+ pickle.dump(new_feats, f)
100
+
101
+ # Utterance re-index
102
+ news_utts = [utterances[idx] for idx in chosen_idxs]
103
+ for i, utt in enumerate(news_utts):
104
+ utt["Dataset"] = "vctksample"
105
+ utt["index"] = i
106
+
107
+ with open(os.path.join(sample_dir, "{}.json".format(split)), "w") as f:
108
+ json.dump(news_utts, f, indent=4)
preprocessors/vocalist.py ADDED
@@ -0,0 +1,137 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import torchaudio
9
+ from tqdm import tqdm
10
+ from glob import glob
11
+ from collections import defaultdict
12
+
13
+ from utils.util import has_existed
14
+
15
+
16
+ def vocalist_statistics(data_dir):
17
+ singers = []
18
+ songs = []
19
+ global2singer2songs = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
20
+
21
+ global_infos = glob(data_dir + "/*")
22
+
23
+ for global_info in global_infos:
24
+ global_split = global_info.split("/")[-1]
25
+
26
+ singer_infos = glob(global_info + "/*")
27
+
28
+ for singer_info in singer_infos:
29
+ singer = singer_info.split("/")[-1]
30
+
31
+ singers.append(singer)
32
+
33
+ song_infos = glob(singer_info + "/*")
34
+ for song_info in song_infos:
35
+ song = song_info.split("/")[-1]
36
+
37
+ songs.append(song)
38
+
39
+ utts = glob(song_info + "/*.wav")
40
+
41
+ for utt in utts:
42
+ uid = utt.split("/")[-1].split(".")[0]
43
+ global2singer2songs[global_split][singer][song].append(uid)
44
+
45
+ unique_singers = list(set(singers))
46
+ unique_songs = list(set(songs))
47
+ unique_singers.sort()
48
+ unique_songs.sort()
49
+
50
+ print(
51
+ "vocalist: {} singers, {} songs ({} unique songs)".format(
52
+ len(unique_singers), len(songs), len(unique_songs)
53
+ )
54
+ )
55
+ print("Singers: \n{}".format("\t".join(unique_singers)))
56
+ return global2singer2songs, unique_singers
57
+
58
+
59
+ def main(output_path, dataset_path):
60
+ print("-" * 10)
61
+ print("Preparing test samples for vocalist...\n")
62
+
63
+ save_dir = os.path.join(output_path, "vocalist")
64
+ os.makedirs(save_dir, exist_ok=True)
65
+ train_output_file = os.path.join(save_dir, "train.json")
66
+ test_output_file = os.path.join(save_dir, "test.json")
67
+ singer_dict_file = os.path.join(save_dir, "singers.json")
68
+ utt2singer_file = os.path.join(save_dir, "utt2singer")
69
+ if (
70
+ has_existed(train_output_file)
71
+ and has_existed(test_output_file)
72
+ and has_existed(singer_dict_file)
73
+ and has_existed(utt2singer_file)
74
+ ):
75
+ return
76
+ utt2singer = open(utt2singer_file, "w")
77
+
78
+ # Load
79
+ vocalist_path = dataset_path
80
+
81
+ global2singer2songs, unique_singers = vocalist_statistics(vocalist_path)
82
+
83
+ train = []
84
+ test = []
85
+
86
+ train_index_count = 0
87
+ test_index_count = 0
88
+
89
+ train_total_duration = 0
90
+ test_total_duration = 0
91
+
92
+ for global_info, singer2songs in tqdm(global2singer2songs.items()):
93
+ for singer, songs in tqdm(singer2songs.items()):
94
+ song_names = list(songs.keys())
95
+
96
+ for chosen_song in song_names:
97
+ for chosen_uid in songs[chosen_song]:
98
+ res = {
99
+ "Dataset": "vocalist",
100
+ "Singer": singer,
101
+ "Song": chosen_song,
102
+ "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
103
+ }
104
+ res["Path"] = "{}/{}/{}/{}.wav".format(
105
+ global_info, singer, chosen_song, chosen_uid
106
+ )
107
+ res["Path"] = os.path.join(vocalist_path, res["Path"])
108
+ assert os.path.exists(res["Path"])
109
+
110
+ waveform, sample_rate = torchaudio.load(res["Path"])
111
+ duration = waveform.size(-1) / sample_rate
112
+ res["Duration"] = duration
113
+
114
+ res["index"] = test_index_count
115
+ test_total_duration += duration
116
+ test.append(res)
117
+ test_index_count += 1
118
+
119
+ utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
120
+
121
+ print("#Train = {}, #Test = {}".format(len(train), len(test)))
122
+ print(
123
+ "#Train hours= {}, #Test hours= {}".format(
124
+ train_total_duration / 3600, test_total_duration / 3600
125
+ )
126
+ )
127
+
128
+ # Save train.json and test.json
129
+ with open(train_output_file, "w") as f:
130
+ json.dump(train, f, indent=4, ensure_ascii=False)
131
+ with open(test_output_file, "w") as f:
132
+ json.dump(test, f, indent=4, ensure_ascii=False)
133
+
134
+ # Save singers.json
135
+ singer_lut = {name: i for i, name in enumerate(unique_singers)}
136
+ with open(singer_dict_file, "w") as f:
137
+ json.dump(singer_lut, f, indent=4, ensure_ascii=False)
pretrained/bigvgan/args.json ADDED
@@ -0,0 +1,235 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "exp_name": "bigvgan_large",
4
+ "inference": {
5
+ "batch_size": 1,
6
+ },
7
+ "model": {
8
+ "bigvgan": {
9
+ "activation": "snakebeta",
10
+ "resblock": "1",
11
+ "resblock_dilation_sizes": [
12
+ [
13
+ 1,
14
+ 3,
15
+ 5,
16
+ ],
17
+ [
18
+ 1,
19
+ 3,
20
+ 5,
21
+ ],
22
+ [
23
+ 1,
24
+ 3,
25
+ 5,
26
+ ],
27
+ ],
28
+ "resblock_kernel_sizes": [
29
+ 3,
30
+ 7,
31
+ 11,
32
+ ],
33
+ "snake_logscale": true,
34
+ "upsample_initial_channel": 1536,
35
+ "upsample_kernel_sizes": [
36
+ 8,
37
+ 8,
38
+ 4,
39
+ 4,
40
+ 4,
41
+ 4,
42
+ ],
43
+ "upsample_rates": [
44
+ 4,
45
+ 4,
46
+ 2,
47
+ 2,
48
+ 2,
49
+ 2,
50
+ ],
51
+ },
52
+ "discriminators": [
53
+ "mpd",
54
+ "msstftd",
55
+ ],
56
+ "generator": "bigvgan",
57
+ "mpd": {
58
+ "discriminator_channel_multi": 1,
59
+ "mpd_reshapes": [
60
+ 2,
61
+ 3,
62
+ 5,
63
+ 7,
64
+ 11,
65
+ ],
66
+ "use_spectral_norm": false,
67
+ },
68
+ "mrd": {
69
+ "discriminator_channel_multi": 1,
70
+ "mrd_override": false,
71
+ "resolutions": [
72
+ [
73
+ 1024,
74
+ 120,
75
+ 600,
76
+ ],
77
+ [
78
+ 2048,
79
+ 240,
80
+ 1200,
81
+ ],
82
+ [
83
+ 512,
84
+ 50,
85
+ 240,
86
+ ],
87
+ ],
88
+ "use_spectral_norm": false,
89
+ },
90
+ "msstftd": {
91
+ "filters": 32,
92
+ },
93
+ },
94
+ "model_type": "GANVocoder",
95
+ "preprocess": {
96
+ "audio_dir": "audios",
97
+ "bits": 8,
98
+ "contentvec_dir": "contentvec",
99
+ "cut_mel_frame": 32,
100
+ "data_augment": false,
101
+ "dur_dir": "durs",
102
+ "duration_dir": "duration",
103
+ "emo2id": "emo2id.json",
104
+ "energy_dir": "energys",
105
+ "energy_extract_mode": "from_mel",
106
+ "energy_norm": false,
107
+ "extract_audio": true,
108
+ "extract_contentvec_feature": false,
109
+ "extract_duration": false,
110
+ "extract_energy": false,
111
+ "extract_label": false,
112
+ "extract_mcep": false,
113
+ "extract_mel": true,
114
+ "extract_mert_feature": false,
115
+ "extract_one_hot": false,
116
+ "extract_pitch": false,
117
+ "extract_uv": false,
118
+ "extract_wenet_feature": false,
119
+ "extract_whisper_feature": false,
120
+ "f0_max": 1100,
121
+ "f0_min": 50,
122
+ "file_lst": "file.lst",
123
+ "fmax": 12000,
124
+ "fmin": 0,
125
+ "hop_size": 256,
126
+ "is_mu_law": false,
127
+ "lab_dir": "labs",
128
+ "label_dir": "labels",
129
+ "mcep_dir": "mcep",
130
+ "mel_dir": "mels",
131
+ "mel_min_max_norm": false,
132
+ "min_level_db": -115,
133
+ "n_fft": 1024,
134
+ "n_mel": 100,
135
+ "num_silent_frames": 8,
136
+ "phone_seq_file": "phone_seq_file",
137
+ "pitch_bin": 256,
138
+ "pitch_dir": "pitches",
139
+ "pitch_extractor": "parselmouth",
140
+ "pitch_max": 1100.0,
141
+ "pitch_min": 50.0,
142
+ "pitch_norm": false,
143
+ "processed_dir": "processed_data",
144
+ "ref_level_db": 20,
145
+ "sample_rate": 24000,
146
+ "spk2id": "singers.json",
147
+ "train_file": "train.json",
148
+ "trim_fft_size": 512,
149
+ "trim_hop_size": 128,
150
+ "trim_silence": false,
151
+ "trim_top_db": 30,
152
+ "trimmed_wav_dir": "trimmed_wavs",
153
+ "use_audio": true,
154
+ "use_dur": false,
155
+ "use_emoid": false,
156
+ "use_frame_duration": false,
157
+ "use_frame_energy": false,
158
+ "use_frame_pitch": false,
159
+ "use_lab": false,
160
+ "use_label": false,
161
+ "use_log_scale_energy": false,
162
+ "use_log_scale_pitch": false,
163
+ "use_mel": true,
164
+ "use_one_hot": false,
165
+ "use_phn_seq": false,
166
+ "use_phone_duration": false,
167
+ "use_phone_energy": false,
168
+ "use_phone_pitch": false,
169
+ "use_spkid": false,
170
+ "use_uv": false,
171
+ "use_wav": false,
172
+ "use_wenet": false,
173
+ "utt2emo": "utt2emo",
174
+ "utt2spk": "utt2spk",
175
+ "uv_dir": "uvs",
176
+ "valid_file": "test.json",
177
+ "wav_dir": "wavs",
178
+ "wenet_dir": "wenet",
179
+ "win_size": 1024,
180
+ },
181
+ "supported_model_type": [
182
+ "GANVocoder",
183
+ "Fastspeech2",
184
+ "DiffSVC",
185
+ "Transformer",
186
+ "EDM",
187
+ "CD",
188
+ ],
189
+ "train": {
190
+ "adamw": {
191
+ "adam_b1": 0.8,
192
+ "adam_b2": 0.99,
193
+ "lr": 0.0002,
194
+ },
195
+ "batch_size": 4,
196
+ "criterions": [
197
+ "feature",
198
+ "discriminator",
199
+ "generator",
200
+ "mel",
201
+ ],
202
+ "dataloader": {
203
+ "num_worker": 4,
204
+ "pin_memory": true,
205
+ },
206
+ "ddp": true,
207
+ "epochs": 50000,
208
+ "exponential_lr": {
209
+ "lr_decay": 0.999,
210
+ },
211
+ "gradient_accumulation_step": 1,
212
+ "keep_checkpoint_max": 5,
213
+ "max_epoch": 1000000,
214
+ "max_steps": 1000000,
215
+ "multi_speaker_training": false,
216
+ "random_seed": 114514,
217
+ "run_eval": [
218
+ true,
219
+ ],
220
+ "sampler": {
221
+ "drop_last": true,
222
+ "holistic_shuffle": true,
223
+ },
224
+ "save_checkpoint_stride": [
225
+ 200,
226
+ ],
227
+ "save_checkpoints_steps": 10000,
228
+ "save_summary_steps": 500,
229
+ "total_training_steps": 50000,
230
+ "tracker": [
231
+ "tensorboard",
232
+ ],
233
+ "valid_interval": 10000,
234
+ },
235
+ }
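Note that this config uses trailing commas, which the standard-library json module rejects, so a lenient JSON5-style parser is presumably expected. A minimal loading sketch under that assumption (the json5 package is an assumption; the field names come from the file above):

import json5  # assumption: a JSON5-style parser handles the trailing commas above

with open("pretrained/bigvgan/args.json") as f:
    cfg = json5.load(f)

print(cfg["model"]["generator"])         # "bigvgan"
print(cfg["preprocess"]["sample_rate"])  # 24000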
pretrained/contentvec/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # Download
2
+
3
+ - [Link](https://github.com/auspicious3000/contentvec)
4
+ - Model: `ContentVec_legacy`
5
+ - Classes: 500