Text-to-Speech
English
geneing committed
Commit 5b93bbf · 1 Parent(s): c3b0d86

Working on onnx export.

Files changed (2)
  1. models.py +45 -18
  2. test.ipynb +0 -0
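
The recurring change in models.py below replaces input_lengths.cpu().numpy() with a plain CPU tensor before nn.utils.rnn.pack_padded_sequence: a numpy round-trip falls outside the traced graph and blocks ONNX export. A minimal sketch of the pattern (illustrative sizes only, not the model's real dimensions):

import torch
import torch.nn as nn

# Illustrative sizes only; the real dims come from the model config.
x = torch.randn(2, 5, 8)        # [B, T, chn]
lengths = torch.tensor([5, 3])  # valid length of each sequence

# pack_padded_sequence accepts a 1-D int64 CPU tensor for lengths;
# a .numpy() conversion here would leave the graph during tracing.
packed = nn.utils.rnn.pack_padded_sequence(
    x, lengths.cpu(), batch_first=True, enforce_sorted=False)

lstm = nn.LSTM(8, 4, batch_first=True)
lstm.flatten_parameters()
out, _ = lstm(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)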
models.py CHANGED
@@ -272,9 +272,8 @@ class TextEncoder(nn.Module):
 
         x = x.transpose(1, 2)  # [B, T, chn]
 
-        input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths, batch_first=True, enforce_sorted=False)
+            x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
 
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
@@ -292,12 +291,19 @@ class TextEncoder(nn.Module):
         return x
 
     def inference(self, x):
-        x = self.embedding(x)
-        x = x.transpose(1, 2)
-        x = self.cnn(x)
-        x = x.transpose(1, 2)
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)  # [B, emb, T]
+
+        for c in self.cnn:
+            x = c(x)
+
+        x = x.transpose(1, 2)  # [B, T, chn]
+
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
+
+        x = x.transpose(-1, -2)
+
         return x
 
     def length_to_mask(self, lengths):
@@ -433,7 +439,7 @@ class ProsodyPredictor(nn.Module):
         text_size = d.shape[1]
 
         # predict duration
-        input_lengths = text_lengths.cpu().numpy()
+        input_lengths = text_lengths
         x = nn.utils.rnn.pack_padded_sequence(
             d, input_lengths, batch_first=True, enforce_sorted=False)
 
@@ -456,8 +462,14 @@ class ProsodyPredictor(nn.Module):
         return duration.squeeze(-1), en
 
     def F0Ntrain(self, x, s):
-        x, _ = self.shared(x.transpose(-1, -2))
-
+        x1 = x.transpose(-1, -2)
+        torch._check(x1.dim() == 3, lambda: print(f"Expected 3D tensor, got {x1.dim()}D tensor"))
+        torch._check(x1.shape[1] > 1, lambda: print(f"Shape 2, got {x1.size(1)}"))
+        torch._check(x1.shape[2] > 1, lambda: print(f"Shape 2, got {x1.size(2)}"))
+        torch._check(x.shape[2] > 0, lambda: print(f"Shape 2, got {x.size(2)}"))
+        x, _ = self.shared(x1)
+        # torch._check(x.shape[2] > 0, lambda: print(f"Shape 2, got {x.size(2)}"))
+
         F0 = x.transpose(-1, -2)
         for block in self.F0:
             F0 = block(F0, s)
@@ -503,7 +515,6 @@ class DurationEncoder(nn.Module):
         x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
 
         x = x.transpose(0, 1)
-        input_lengths = text_lengths.cpu().numpy()
         x = x.transpose(-1, -2)
 
         for block in self.lstms:
@@ -513,8 +524,9 @@ class DurationEncoder(nn.Module):
                 x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
             else:
                 x = x.transpose(-1, -2)
+
                 x = nn.utils.rnn.pack_padded_sequence(
-                    x, input_lengths, batch_first=True, enforce_sorted=False)
+                    x, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
                 block.flatten_parameters()
                 x, _ = block(x)
                 x, _ = nn.utils.rnn.pad_packed_sequence(
@@ -529,13 +541,28 @@ class DurationEncoder(nn.Module):
 
         return x.transpose(-1, -2)
 
-    def inference(self, x, style):
-        x = self.embedding(x.transpose(-1, -2)) * np.sqrt(self.d_model)
-        style = style.expand(x.shape[0], x.shape[1], -1)
-        x = torch.cat([x, style], axis=-1)
-        src = self.pos_encoder(x)
-        output = self.transformer_encoder(src).transpose(0, 1)
-        return output
+    def inference(self, x: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
+
+        x = x.permute(2, 0, 1)
+        s = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, s], axis=-1)
+
+        x = x.transpose(0, 1)
+        x = x.transpose(-1, -2)
+
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+            else:
+                x = x.transpose(-1, -2)
+
+                block.flatten_parameters()
+                x, _ = block(x)
+
+                x = F.dropout(x, p=self.dropout, training=self.training)
+                x = x.transpose(-1, -2)
+        return x.transpose(-1, -2)
 
     def length_to_mask(self, lengths):
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
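
The torch._check calls added in F0Ntrain are export-time shape guards: each asserts a fact about a dynamic dimension that the exporter cannot infer on its own. Note the documented pattern passes a lambda that returns the error message (the committed version prints it instead). A sketch of that pattern, assuming torch >= 2.1; the function name here is hypothetical:

import torch

def guarded_shared_input(x: torch.Tensor) -> torch.Tensor:
    # Guards recorded by torch.export / ONNX export: each asserts a
    # property of a dynamic shape so the graph can rely on it.
    x1 = x.transpose(-1, -2)
    torch._check(x1.dim() == 3, lambda: f"Expected 3D tensor, got {x1.dim()}D")
    torch._check(x1.shape[1] > 1, lambda: f"Expected dim 1 > 1, got {x1.size(1)}")
    torch._check(x1.shape[2] > 1, lambda: f"Expected dim 2 > 1, got {x1.size(2)}")
    return x1

guarded_shared_input(torch.randn(2, 4, 6))  # passes all guards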
test.ipynb ADDED
The diff for this file is too large to render. See raw diff
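
For context on the commit's goal ("Working on onnx export"), a hedged sketch of the kind of export being worked toward. TinyEncoder is a stand-in, not this repo's TextEncoder; the shapes and names are assumptions:

import torch
import torch.nn as nn

# Hypothetical stand-in for an LSTM-based submodule like TextEncoder.
class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(8, 4, batch_first=True)

    def forward(self, x):
        self.lstm.flatten_parameters()
        out, _ = self.lstm(x)
        return out

model = TinyEncoder().eval()
dummy = torch.randn(1, 5, 8)  # [B, T, chn]
torch.onnx.export(
    model, (dummy,), "encoder.onnx",
    input_names=["x"], output_names=["out"],
    dynamic_axes={"x": {1: "T"}, "out": {1: "T"}},  # variable time axis
)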