Spaces status: Runtime error

Commit: "Fix OOM" — Browse files

Files changed:
- app.py              +2 -2
- descriptions.py     +7 -1
- models/vallex.py    +12 -0
- requirements.txt    +1 -0
- utils/generation.py +2 -1
app.py (CHANGED)

@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
 text_collater = get_text_token_collater()

 device = torch.device("cpu")
-if torch.cuda.is_available():
-    device = torch.device("cuda", 0)
+# if torch.cuda.is_available():
+#     device = torch.device("cuda", 0)

 # VALL-E-X model
 model = VALLE(

(Note: the CUDA device selection is commented out so the Space always runs on CPU; the removed second line is reconstructed from its commented-out replacement, as the scrape dropped its content.)
descriptions.py (CHANGED)

@@ -1,6 +1,12 @@
 top_md = """
 # VALL-E X
-
+<a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
+  style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+  <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+  src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> or <a href="https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing"
+  style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+  <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+  src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>to skip the queue.</p>
 VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
 an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
 This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>

(Note: adds "Duplicate Space" and "Open in Colab" badges to the top markdown; the single removed line at old line 3 appears to have been blank — its content was lost in extraction.)
models/vallex.py (CHANGED)

@@ -33,6 +33,15 @@ from modules.transformer import (

 from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS

+import psutil
+def get_memory_usage():
+    process = psutil.Process()
+    memory_info = process.memory_info()
+
+    memory_used = memory_info.rss
+    memory_used_mb = memory_used / (1024 * 1024)
+
+    return memory_used_mb

 class Transpose(nn.Identity):
     """(N, T, D) -> (N, D, T)"""

@@ -572,6 +581,9 @@ class VALLE(VALLF):
                 )

                 print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
+
+                memory_used = get_memory_usage()
+                print(f"Current memory used: {memory_used:.2f} MB")
                 break

             y = torch.concat([y, samples], dim=1)

(Note: adds a psutil-based RSS memory reporter and prints current memory usage at the VALL-E EOS point, presumably to diagnose the OOM. Indentation of the second hunk is inferred — the scrape destroyed it — TODO confirm against the actual file.)
|
requirements.txt (CHANGED)

@@ -18,4 +18,5 @@ nltk
 openai-whisper
 phonemizer
 matplotlib
+psutil
 gradio
utils/generation.py (CHANGED)

@@ -2,6 +2,7 @@ import os
 import torch
 import gdown
 import logging
+import psutil
 import langid
 langid.set_languages(['en', 'zh', 'ja'])

@@ -253,4 +254,4 @@ def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no
     )
     return samples[0][0].cpu().numpy()
 else:
-    raise ValueError(f"No such mode {mode}")
+    raise ValueError(f"No such mode {mode}")

(Note: adds `import psutil`; the final hunk's removed and added lines are textually identical in the scrape, so the change is presumably whitespace-only — e.g. a trailing newline at end of file. Indentation is inferred; TODO confirm against the actual file.)