Spaces: Running on Zero

Commit 3d69f83: trying to cast
Harry Coultas Blum committed
Parent(s): 84d326d

Changed files:
- app.py (+5 -5)
- inference.py (+11 -6)
- vui/inference.py (+9 -7)
app.py
CHANGED
@@ -19,7 +19,8 @@ def get_available_models():
     return models
 
 
-AVAILABLE_MODELS = get_available_models()
+# AVAILABLE_MODELS = get_available_models()
+AVAILABLE_MODELS = {"COHOST": Vui.COHOST}
 print(f"Available models: {list(AVAILABLE_MODELS.keys())}")
 
 current_model = None
@@ -39,7 +40,7 @@ def load_and_warm_model(model_name):
     model = Vui.from_pretrained_inf(model_path).cuda()
 
     print(f"Compiling model {model_name}...")
-    model.decoder = torch.compile(model.decoder, fullgraph=True)
+    # model.decoder = torch.compile(model.decoder, fullgraph=True)
 
     print(f"Warming up model {model_name}...")
     warmup_text = "Hello, this is a test. Let's say some random shizz"
@@ -169,7 +170,7 @@ def load_sample_text(sample_index):
 
 
 # Create Gradio interface
-
+gradio_interface = gr.Blocks(
     title="Vui",
     theme=gr.themes.Soft(),
     head="""
@@ -383,5 +384,4 @@ document.addEventListener('DOMContentLoaded', function() {
     outputs=[text_input, audio_output, info_output],
 )
 
-
-demo.launch(server_name="0.0.0.0", share=True)
+demo.launch()
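Note: this commit pins the model list to COHOST instead of scanning for checkpoints, comments out torch.compile in the Gradio app, and drops the explicit server_name/share arguments from demo.launch(). As a minimal sketch (not part of the commit; maybe_compile is a hypothetical helper name), compilation could instead be kept behind a toggle so it can be re-enabled without editing the call site:

import torch

# Hypothetical helper (not from this repo): compile a module only when
# explicitly enabled, otherwise fall back to eager execution.
def maybe_compile(module: torch.nn.Module, enabled: bool = False) -> torch.nn.Module:
    if enabled and hasattr(torch, "compile"):
        return torch.compile(module, fullgraph=True)
    return module

# Usage at the call site in load_and_warm_model:
# model.decoder = maybe_compile(model.decoder, enabled=False)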
inference.py
CHANGED
@@ -1,12 +1,17 @@
 import torchaudio
+import torch
 
 from vui.inference import render
 from vui.model import Vui
 
 model = Vui.from_pretrained().cuda()
-
-
-
-
-
-
+model.decoder = torch.compile(model.decoder, fullgraph=True, mode="max-autotune")
+for i in range(10):
+    waveform = render(
+        model,
+        """Hey, here is some random stuff, usually something quite long as the shorter the text the less likely the model can cope!
+So cool yeah makes sense, would you be able to help me with something?
+Sure what is it?""",
+    )
+    print(waveform.shape)
+    torchaudio.save("out.opus", waveform[0].cpu(), 22050)
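The new example script compiles the decoder with mode="max-autotune" and then calls render ten times: the first call pays the compilation and autotuning cost, while later calls reuse the compiled graph. A minimal timing sketch (assuming the render(model, text) call shown above) makes that warmup effect visible:

import time

# Sketch: time each iteration; the first one includes torch.compile's
# autotuning, so later iterations should be much faster.
for i in range(10):
    t0 = time.perf_counter()
    waveform = render(
        model,
        "A reasonably long test sentence, since very short prompts are harder for the model.",
    )
    print(f"iter {i}: {time.perf_counter() - t0:.2f}s, waveform shape {tuple(waveform.shape)}")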
vui/inference.py
CHANGED
@@ -83,7 +83,7 @@ def replace_numbers_with_words(text):
     return re.sub(r"\d+", number_to_words, text)
 
 
-valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate"]
+valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate", "clearthroat"]
 valid_non_speech = [f"[{v}]" for v in valid_non_speech]
 
 
@@ -316,24 +316,26 @@ def render(
     Render audio from text. Uses generate for text < 1000 characters,
     otherwise breaks text into sections and uses chunking with context.
     """
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
     text = remove_all_invalid_non_speech(text)
     text = simple_clean(text)
     SR = self.codec.config.sample_rate
     HZ = self.codec.hz
     max_gen_len = int(HZ * max_secs)
+    t1 = time.perf_counter()
 
-    if len(text) < 1000:
+    if len(text) < 1400:
         codes = generate(
             self, text, prompt_codes, temperature, top_k, top_p, max_gen_len
         )
         codes = codes[..., :-10]
         audio = self.codec.from_indices(codes)
-
-
-
-
+        print("RTF", (audio.numel()/SR)/(time.perf_counter() - t1))
+        return audio.cpu()
+
     # Otherwise we have to do some clever chaining!
-
     orig_codes = prompt_codes
 
     lines = text.split("\n")
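The RTF print added to render() reports seconds of generated audio divided by seconds of wall-clock time (audio.numel() / SR over the elapsed time), so values above 1.0 mean faster-than-real-time synthesis; the TF32 flags trade a little matmul precision for speed on Ampere-class GPUs. A self-contained sketch of the same calculation (realtime_factor is a hypothetical helper name, not from this repo):

def realtime_factor(num_samples: int, sample_rate: int, elapsed_s: float) -> float:
    # Seconds of audio produced per second of wall-clock time;
    # mirrors the print("RTF", ...) line added in render().
    return (num_samples / sample_rate) / elapsed_s

# Example with made-up numbers: 10 s of 22050 Hz audio generated in 4 s.
print(realtime_factor(220500, 22050, 4.0))  # 2.5 -> 2.5x faster than real time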