Spaces status: Runtime error

Commit: "Fix OOM" — Browse files

Files changed:
- app.py              +2 -2
- descriptions.py     +7 -1
- models/vallex.py    +12 -0
- requirements.txt    +1 -0
- utils/generation.py +2 -1
app.py (CHANGED)

@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
 text_collater = get_text_token_collater()

 device = torch.device("cpu")
-if torch.cuda.is_available():
-    device = torch.device("cuda", 0)
+# if torch.cuda.is_available():
+#     device = torch.device("cuda", 0)

 # VALL-E-X model
 model = VALLE(

(Note: the CUDA device selection is commented out so the Space always runs on CPU; the removed second line is reconstructed from its commented-out replacement, as the scrape dropped its content.)
descriptions.py (CHANGED)

@@ -1,6 +1,12 @@
 top_md = """
 # VALL-E X
-
+<a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
+  style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+  <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+  src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> or <a href="https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing"
+  style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+  <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+  src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>to skip the queue.</p>
 VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
 an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
 This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>

(Note: adds "Duplicate Space" and "Open in Colab" badges to the top markdown; the single removed line at old line 3 appears to have been blank — its content was lost in extraction.)
models/vallex.py (CHANGED)

@@ -33,6 +33,15 @@ from modules.transformer import (

 from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS

+import psutil
+def get_memory_usage():
+    process = psutil.Process()
+    memory_info = process.memory_info()
+
+    memory_used = memory_info.rss
+    memory_used_mb = memory_used / (1024 * 1024)
+
+    return memory_used_mb

 class Transpose(nn.Identity):
     """(N, T, D) -> (N, D, T)"""

@@ -572,6 +581,9 @@ class VALLE(VALLF):
                 )

                 print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
+
+                memory_used = get_memory_usage()
+                print(f"Current memory used: {memory_used:.2f} MB")
                 break

             y = torch.concat([y, samples], dim=1)

(Note: adds a psutil-based RSS memory reporter and prints current memory usage at the VALL-E EOS point, presumably to diagnose the OOM. Indentation of the second hunk is inferred — the scrape destroyed it — TODO confirm against the actual file.)
|
requirements.txt (CHANGED)

@@ -18,4 +18,5 @@ nltk
 openai-whisper
 phonemizer
 matplotlib
+psutil
 gradio
utils/generation.py (CHANGED)

@@ -2,6 +2,7 @@ import os
 import torch
 import gdown
 import logging
+import psutil
 import langid
 langid.set_languages(['en', 'zh', 'ja'])

@@ -253,4 +254,4 @@ def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no
     )
     return samples[0][0].cpu().numpy()
 else:
-    raise ValueError(f"No such mode {mode}")
+    raise ValueError(f"No such mode {mode}")

(Note: adds `import psutil`; the final hunk's removed and added lines are textually identical in the scrape, so the change is presumably whitespace-only — e.g. a trailing newline at end of file. Indentation is inferred; TODO confirm against the actual file.)