Working on ONNX export.
Browse files
- models.py +45 -18
- test.ipynb +0 -0
models.py
CHANGED
@@ -272,9 +272,8 @@ class TextEncoder(nn.Module):
|
|
272 |
|
273 |
x = x.transpose(1, 2) # [B, T, chn]
|
274 |
|
275 |
-
input_lengths = input_lengths.cpu().numpy()
|
276 |
x = nn.utils.rnn.pack_padded_sequence(
|
277 |
-
x, input_lengths, batch_first=True, enforce_sorted=False)
|
278 |
|
279 |
self.lstm.flatten_parameters()
|
280 |
x, _ = self.lstm(x)
|
@@ -292,12 +291,19 @@ class TextEncoder(nn.Module):
|
|
292 |
return x
|
293 |
|
294 |
def inference(self, x):
|
295 |
-
x = self.embedding(x)
|
296 |
-
x = x.transpose(1, 2)
|
297 |
-
|
298 |
-
|
|
|
|
|
|
|
|
|
299 |
self.lstm.flatten_parameters()
|
300 |
x, _ = self.lstm(x)
|
|
|
|
|
|
|
301 |
return x
|
302 |
|
303 |
def length_to_mask(self, lengths):
|
@@ -433,7 +439,7 @@ class ProsodyPredictor(nn.Module):
|
|
433 |
text_size = d.shape[1]
|
434 |
|
435 |
# predict duration
|
436 |
-
input_lengths = text_lengths
|
437 |
x = nn.utils.rnn.pack_padded_sequence(
|
438 |
d, input_lengths, batch_first=True, enforce_sorted=False)
|
439 |
|
@@ -456,8 +462,14 @@ class ProsodyPredictor(nn.Module):
|
|
456 |
return duration.squeeze(-1), en
|
457 |
|
458 |
def F0Ntrain(self, x, s):
|
459 |
-
|
460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
461 |
F0 = x.transpose(-1, -2)
|
462 |
for block in self.F0:
|
463 |
F0 = block(F0, s)
|
@@ -503,7 +515,6 @@ class DurationEncoder(nn.Module):
|
|
503 |
x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
|
504 |
|
505 |
x = x.transpose(0, 1)
|
506 |
-
input_lengths = text_lengths.cpu().numpy()
|
507 |
x = x.transpose(-1, -2)
|
508 |
|
509 |
for block in self.lstms:
|
@@ -513,8 +524,9 @@ class DurationEncoder(nn.Module):
|
|
513 |
x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
|
514 |
else:
|
515 |
x = x.transpose(-1, -2)
|
|
|
516 |
x = nn.utils.rnn.pack_padded_sequence(
|
517 |
-
x,
|
518 |
block.flatten_parameters()
|
519 |
x, _ = block(x)
|
520 |
x, _ = nn.utils.rnn.pad_packed_sequence(
|
@@ -529,13 +541,28 @@ class DurationEncoder(nn.Module):
|
|
529 |
|
530 |
return x.transpose(-1, -2)
|
531 |
|
532 |
-
def inference(self, x, style):
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
539 |
|
540 |
def length_to_mask(self, lengths):
|
541 |
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
|
|
|
272 |
|
273 |
x = x.transpose(1, 2) # [B, T, chn]
|
274 |
|
|
|
275 |
x = nn.utils.rnn.pack_padded_sequence(
|
276 |
+
x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
|
277 |
|
278 |
self.lstm.flatten_parameters()
|
279 |
x, _ = self.lstm(x)
|
|
|
291 |
return x
|
292 |
|
293 |
def inference(self, x):
|
294 |
+
x = self.embedding(x) # [B, T, emb]
|
295 |
+
x = x.transpose(1, 2) # [B, emb, T]
|
296 |
+
|
297 |
+
for c in self.cnn:
|
298 |
+
x = c(x)
|
299 |
+
|
300 |
+
x = x.transpose(1, 2) # [B, T, chn]
|
301 |
+
|
302 |
self.lstm.flatten_parameters()
|
303 |
x, _ = self.lstm(x)
|
304 |
+
|
305 |
+
x = x.transpose(-1, -2)
|
306 |
+
|
307 |
return x
|
308 |
|
309 |
def length_to_mask(self, lengths):
|
|
|
439 |
text_size = d.shape[1]
|
440 |
|
441 |
# predict duration
|
442 |
+
input_lengths = text_lengths
|
443 |
x = nn.utils.rnn.pack_padded_sequence(
|
444 |
d, input_lengths, batch_first=True, enforce_sorted=False)
|
445 |
|
|
|
462 |
return duration.squeeze(-1), en
|
463 |
|
464 |
def F0Ntrain(self, x, s):
|
465 |
+
x1 = x.transpose(-1, -2)
|
466 |
+
torch._check(x1.dim() == 3, lambda: print(f"Expected 3D tensor, got {x1.dim()}D tensor"))
|
467 |
+
torch._check(x1.shape[1] > 1, lambda: print(f"Shape 2, got {x1.size(1)}"))
|
468 |
+
torch._check(x1.shape[2] > 1, lambda: print(f"Shape 2, got {x1.size(2)}"))
|
469 |
+
torch._check(x.shape[2] > 0, lambda: print(f"Shape 2, got {x.size(2)}"))
|
470 |
+
x, _ = self.shared(x1)
|
471 |
+
# torch._check(x.shape[2] > 0, lambda: print(f"Shape 2, got {x.size(2)}"))
|
472 |
+
|
473 |
F0 = x.transpose(-1, -2)
|
474 |
for block in self.F0:
|
475 |
F0 = block(F0, s)
|
|
|
515 |
x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
|
516 |
|
517 |
x = x.transpose(0, 1)
|
|
|
518 |
x = x.transpose(-1, -2)
|
519 |
|
520 |
for block in self.lstms:
|
|
|
524 |
x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
|
525 |
else:
|
526 |
x = x.transpose(-1, -2)
|
527 |
+
|
528 |
x = nn.utils.rnn.pack_padded_sequence(
|
529 |
+
x, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
|
530 |
block.flatten_parameters()
|
531 |
x, _ = block(x)
|
532 |
x, _ = nn.utils.rnn.pad_packed_sequence(
|
|
|
541 |
|
542 |
return x.transpose(-1, -2)
|
543 |
|
544 |
+
def inference(self, x: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
|
545 |
+
|
546 |
+
x = x.permute(2, 0, 1)
|
547 |
+
s = style.expand(x.shape[0], x.shape[1], -1)
|
548 |
+
x = torch.cat([x, s], axis=-1)
|
549 |
+
|
550 |
+
x = x.transpose(0, 1)
|
551 |
+
x = x.transpose(-1, -2)
|
552 |
+
|
553 |
+
for block in self.lstms:
|
554 |
+
if isinstance(block, AdaLayerNorm):
|
555 |
+
x = block(x.transpose(-1, -2), style).transpose(-1, -2)
|
556 |
+
x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
|
557 |
+
else:
|
558 |
+
x = x.transpose(-1, -2)
|
559 |
+
|
560 |
+
block.flatten_parameters()
|
561 |
+
x, _ = block(x)
|
562 |
+
|
563 |
+
x = F.dropout(x, p=self.dropout, training=self.training)
|
564 |
+
x = x.transpose(-1, -2)
|
565 |
+
return x.transpose(-1, -2)
|
566 |
|
567 |
def length_to_mask(self, lengths):
|
568 |
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
|
test.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|