Spaces: Running on Zero

Commit 3d69f83: trying to cast
Harry Coultas Blum committed
Parent(s): 84d326d

Changed files:
- app.py (+5 -5)
- inference.py (+11 -6)
- vui/inference.py (+9 -7)
app.py
CHANGED
@@ -19,7 +19,8 @@ def get_available_models():
     return models
 
 
-AVAILABLE_MODELS = get_available_models()
+# AVAILABLE_MODELS = get_available_models()
+AVAILABLE_MODELS = {"COHOST": Vui.COHOST}
 print(f"Available models: {list(AVAILABLE_MODELS.keys())}")
 
 current_model = None
@@ -39,7 +40,7 @@ def load_and_warm_model(model_name):
     model = Vui.from_pretrained_inf(model_path).cuda()
 
     print(f"Compiling model {model_name}...")
-    model.decoder = torch.compile(model.decoder, fullgraph=True)
+    # model.decoder = torch.compile(model.decoder, fullgraph=True)
 
     print(f"Warming up model {model_name}...")
     warmup_text = "Hello, this is a test. Let's say some random shizz"
@@ -169,7 +170,7 @@ def load_sample_text(sample_index):
 
 
 # Create Gradio interface
-
+gradio_interface = gr.Blocks(
     title="Vui",
     theme=gr.themes.Soft(),
     head="""
@@ -383,5 +384,4 @@ document.addEventListener('DOMContentLoaded', function() {
     outputs=[text_input, audio_output, info_output],
 )
 
-
-demo.launch(server_name="0.0.0.0", share=True)
+demo.launch()
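Note: this commit pins the model list to COHOST instead of scanning for checkpoints, comments out torch.compile in the Gradio app, and drops the explicit server_name/share arguments from demo.launch(). As a minimal sketch (not part of the commit; maybe_compile is a hypothetical helper name), compilation could instead be kept behind a toggle so it can be re-enabled without editing the call site:

import torch

# Hypothetical helper (not from this repo): compile a module only when
# explicitly enabled, otherwise fall back to eager execution.
def maybe_compile(module: torch.nn.Module, enabled: bool = False) -> torch.nn.Module:
    if enabled and hasattr(torch, "compile"):
        return torch.compile(module, fullgraph=True)
    return module

# Usage at the call site in load_and_warm_model:
# model.decoder = maybe_compile(model.decoder, enabled=False)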
inference.py
CHANGED
@@ -1,12 +1,17 @@
 import torchaudio
+import torch
 
 from vui.inference import render
 from vui.model import Vui
 
 model = Vui.from_pretrained().cuda()
-
-
-
-
-
-
+model.decoder = torch.compile(model.decoder, fullgraph=True, mode="max-autotune")
+for i in range(10):
+    waveform = render(
+        model,
+        """Hey, here is some random stuff, usually something quite long as the shorter the text the less likely the model can cope!
+So cool yeah makes sense, would you be able to help me with something?
+Sure what is it?""",
+    )
+    print(waveform.shape)
+    torchaudio.save("out.opus", waveform[0].cpu(), 22050)
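The new example script compiles the decoder with mode="max-autotune" and then calls render ten times: the first call pays the compilation and autotuning cost, while later calls reuse the compiled graph. A minimal timing sketch (assuming the render(model, text) call shown above) makes that warmup effect visible:

import time

# Sketch: time each iteration; the first one includes torch.compile's
# autotuning, so later iterations should be much faster.
for i in range(10):
    t0 = time.perf_counter()
    waveform = render(
        model,
        "A reasonably long test sentence, since very short prompts are harder for the model.",
    )
    print(f"iter {i}: {time.perf_counter() - t0:.2f}s, waveform shape {tuple(waveform.shape)}")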
vui/inference.py
CHANGED
@@ -83,7 +83,7 @@ def replace_numbers_with_words(text):
     return re.sub(r"\d+", number_to_words, text)
 
 
-valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate"]
+valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate", "clearthroat"]
 valid_non_speech = [f"[{v}]" for v in valid_non_speech]
 
 
@@ -316,24 +316,26 @@ def render(
     Render audio from text. Uses generate for text < 1000 characters,
     otherwise breaks text into sections and uses chunking with context.
     """
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
     text = remove_all_invalid_non_speech(text)
     text = simple_clean(text)
     SR = self.codec.config.sample_rate
     HZ = self.codec.hz
     max_gen_len = int(HZ * max_secs)
+    t1 = time.perf_counter()
 
-    if len(text) < 1000:
+    if len(text) < 1400:
         codes = generate(
             self, text, prompt_codes, temperature, top_k, top_p, max_gen_len
         )
         codes = codes[..., :-10]
         audio = self.codec.from_indices(codes)
-
-
-
-
+        print("RTF", (audio.numel()/SR)/(time.perf_counter() - t1))
+        return audio.cpu()
+
     # Otherwise we have to do some clever chaining!
-
     orig_codes = prompt_codes
 
     lines = text.split("\n")
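The RTF print added to render() reports seconds of generated audio divided by seconds of wall-clock time (audio.numel() / SR over the elapsed time), so values above 1.0 mean faster-than-real-time synthesis; the TF32 flags trade a little matmul precision for speed on Ampere-class GPUs. A self-contained sketch of the same calculation (realtime_factor is a hypothetical helper name, not from this repo):

def realtime_factor(num_samples: int, sample_rate: int, elapsed_s: float) -> float:
    # Seconds of audio produced per second of wall-clock time;
    # mirrors the print("RTF", ...) line added in render().
    return (num_samples / sample_rate) / elapsed_s

# Example with made-up numbers: 10 s of 22050 Hz audio generated in 4 s.
print(realtime_factor(220500, 22050, 4.0))  # 2.5 -> 2.5x faster than real time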