Staticaliza committed on
Commit 3d58a26 · verified · 1 Parent(s): 193e78a

Update app.py

Files changed (1)
  1. app.py +20 -233
app.py CHANGED
@@ -1,100 +1,3 @@
- # # app.py ── Zonos TTS (transformer only, minimal UI)
-
- # import os, tempfile, torch, numpy as np, gradio as gr, torchaudio, soundfile as sf, spaces
- # from zonos.model import Zonos
- # from zonos.conditioning import make_cond_dict, supported_language_codes
-
- # # disable Torch-Inductor (keeps Spaces happy)
- # os.environ["TORCH_COMPILE_DISABLE"] = os.environ["TORCHINDUCTOR_DISABLE"] = "1"
- # torch._dynamo.disable()
- # torch.compile = lambda f,*a,**k: f  # no-op
-
- # device = "cuda"
- # model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device).eval()
-
- # # ─────────────────── helpers ────────────────────
- # def _speaker_embed(aud):
- #     if aud is None:
- #         return None
- #     sr, wav = aud  # gradio returns (sr, np.ndarray)
- #     if wav.dtype.kind in "iu":  # int → float
- #         wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
- #     wav_t = torch.from_numpy(wav).unsqueeze(0)  # (1,C,N)
- #     return model.make_speaker_embedding(wav_t, sr)
-
- # # ─────────────────── inference ───────────────────
- # @spaces.GPU
- # def tts(text, lang, speaker,
- #         e1,e2,e3,e4,e5,e6,e7,e8,
- #         vq, fmax, pitch, rate,
- #         cfg, minp, tokens):
-
- #     emotion = torch.tensor([float(x) for x in [e1,e2,e3,e4,e5,e6,e7,e8]],
- #                            device=device, dtype=torch.float32)
-
- #     cond = make_cond_dict(
- #         text=text,
- #         language=lang,
- #         speaker=_speaker_embed(speaker),
- #         emotion=emotion,
- #         vqscore_8=torch.tensor([vq]*8, device=device).unsqueeze(0),
- #         fmax=float(fmax),
- #         pitch_std=float(pitch),
- #         speaking_rate=float(rate),
- #         device=device
- #     )
-
- #     with torch.no_grad():
- #         codes = model.generate(model.prepare_conditioning(cond),
- #                                max_new_tokens=int(tokens),
- #                                cfg_scale=float(cfg),
- #                                sampling_params=dict(min_p=float(minp)))
- #     wav = model.autoencoder.decode(codes)[0]  # (C,N) torch
- #     wav = wav.cpu().clamp_(-1,1).numpy()  # → numpy
-
- #     # >>> FIX <<< ensure (N,) or (N, C) for libsndfile
- #     wav = np.squeeze(wav)
- #     if wav.ndim == 2:  # currently (C,N)
- #         wav = wav.T  # → (N,C)
-
- #     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
- #         sf.write(tmp.name, wav,
- #                  model.autoencoder.sampling_rate,
- #                  format="WAV", subtype="PCM_16")
- #     return tmp.name
-
- # # ─────────────────── UI ──────────────────────────
- # langs = supported_language_codes
- # with gr.Blocks() as demo:
- #     text = gr.Textbox(label="Text")
- #     lang = gr.Dropdown(langs, value="en-us", label="Language")
- #     speaker = gr.Audio(type="numpy", label="Speaker ref (optional)")
-
- #     # emotion sliders (all default 0)
- #     emotions = []
- #     for label in ["happiness","sadness","disgust","fear",
- #                   "surprise","anger","other","neutral"]:
- #         emotions.append(gr.Slider(0,1,0.0,0.05,label=label))
-
- #     vq = gr.Slider(0.5,0.9,0.78,0.01,label="clarity (vq)")
- #     fmax = gr.Slider(0,24000,24000,100,label="fmax (Hz)")
- #     pitch = gr.Slider(0,300,45,1,label="pitch variation")
- #     rate = gr.Slider(5,30,15,0.5,label="speaking rate")
- #     cfg = gr.Slider(1.1,5,2,0.1,label="guidance scale")
- #     minp = gr.Slider(0,1,0.15,0.01,label="min-p")
- #     tokens = gr.Slider(0,3000,300,1,label="tokens (≈steps)")
-
- #     out = gr.Audio(type="filepath", label="Output")
- #     gr.Button("Generate").click(
- #         tts,
- #         inputs=[text, lang, speaker, *emotions,
- #                 vq, fmax, pitch, rate, cfg, minp, tokens],
- #         outputs=out
- #     )
-
- # if __name__ == "__main__":
- #     demo.launch()
-
  import os
  import shlex
  import subprocess
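One detail from the prototype removed above is still worth remembering when touching the output path: soundfile/libsndfile expects audio shaped (frames,) or (frames, channels), while the autoencoder decodes to (channels, frames). A standalone sketch of that reshaping, not part of the commit (the dummy buffer and the 44100 Hz rate are placeholders):

    import numpy as np
    import soundfile as sf

    wav = np.zeros((1, 48000), dtype=np.float32)  # pretend decoder output, shape (channels, frames)
    wav = np.squeeze(wav)                         # mono collapses to (frames,)
    if wav.ndim == 2:                             # multichannel: transpose to (frames, channels)
        wav = wav.T
    sf.write("out.wav", wav, 44100, subtype="PCM_16")  # placeholder sample rate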
@@ -122,12 +25,31 @@ from os import getenv
  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

+ # 1. hard-kill torch.compile / dynamo / inductor so they never run
+ os.environ["TORCH_COMPILE_DISABLE"] = "1"
+ os.environ["TORCHINDUCTOR_DISABLE"] = "1"
+ os.environ["TORCHDYNAMO_DISABLE"] = "1"             # <- the one that actually blocks torch._dynamo
+ os.environ["TORCHDYNAMO_SUPPRESS_ERRORS"] = "True"  # fall back to eager if something still slips through
+
+ torch._dynamo.disable()                # guard for older versions
+ torch.compile = lambda f, *_, **__: f  # no-op wrapper
+
  device = "cuda"
  MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
  MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
  for model in MODELS.values():
      model.requires_grad_(False).eval()

+ def _patch_cuda_props():
+     if torch.cuda.is_available():
+         for i in range(torch.cuda.device_count()):
+             p = torch.cuda.get_device_properties(i)
+             if not hasattr(p, "regs_per_multiprocessor"):
+                 setattr(p, "regs_per_multiprocessor", 65536)
+             if not hasattr(p, "max_threads_per_multi_processor"):
+                 setattr(p, "max_threads_per_multi_processor", 2048)
+
+ _patch_cuda_props()

  def update_ui(model_choice):
      """
@@ -290,144 +212,9 @@ def generate_audio(
          wav_out = wav_out[0:1, :]
      return (sr_out, wav_out.squeeze().numpy()), seed

-
- # Custom CSS for pastel gradient background and enhanced UI
- custom_css = """
- .gradio-container {
-     background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
-     background-size: 400% 400%;
-     animation: gradient 15s ease infinite;
- }
- @keyframes gradient {
-     0% {
-         background-position: 0% 50%;
-     }
-     50% {
-         background-position: 100% 50%;
-     }
-     100% {
-         background-position: 0% 50%;
-     }
- }
- .container {
-     max-width: 1200px;
-     margin: 0 auto;
-     padding: 20px;
- }
- .panel {
-     background-color: rgba(255, 255, 255, 0.7);
-     border-radius: 16px;
-     padding: 20px;
-     box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
-     margin-bottom: 16px;
-     backdrop-filter: blur(5px);
-     transition: all 0.3s ease;
- }
- .panel:hover {
-     box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
-     transform: translateY(-2px);
- }
- .title {
-     font-size: 1.2em;
-     font-weight: 600;
-     margin-bottom: 12px;
-     color: #6a3ea1;
-     border-bottom: 2px solid #f0e6ff;
-     padding-bottom: 8px;
- }
- .slider-container {
-     background-color: rgba(255, 255, 255, 0.5);
-     border-radius: 10px;
-     padding: 10px;
-     margin: 5px 0;
- }
- /* Make sliders more appealing */
- input[type=range] {
-     height: 5px;
-     appearance: none;
-     width: 100%;
-     border-radius: 3px;
-     background: linear-gradient(90deg, #9c83e0, #83b1e0);
- }
- .generate-button {
-     background: linear-gradient(90deg, #a673ff, #7c4dff);
-     color: white;
-     border: none;
-     border-radius: 8px;
-     padding: 12px 24px;
-     font-size: 16px;
-     font-weight: 500;
-     cursor: pointer;
-     transition: all 0.3s ease;
-     box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
-     display: block;
-     width: 100%;
-     margin: 20px 0;
- }
- .generate-button:hover {
-     background: linear-gradient(90deg, #9c5eff, #6a3aff);
-     box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
-     transform: translateY(-2px);
- }
- /* Tabs styling */
- .tabs {
-     display: flex;
-     border-bottom: 1px solid #e0e0e0;
-     margin-bottom: 20px;
- }
- .tab {
-     padding: 10px 20px;
-     cursor: pointer;
-     transition: all 0.3s ease;
-     background-color: transparent;
-     border: none;
-     color: #666;
- }
- .tab.active {
-     color: #7c4dff;
-     border-bottom: 3px solid #7c4dff;
-     font-weight: 600;
- }
- /* Emotion sliders container */
- .emotion-grid {
-     display: grid;
-     grid-template-columns: repeat(4, 1fr);
-     gap: 12px;
- }
- /* Header styling */
- .app-header {
-     text-align: center;
-     margin-bottom: 25px;
- }
- .app-header h1 {
-     font-size: 2.5em;
-     color: #6a3ea1;
-     margin-bottom: 8px;
-     font-weight: 700;
- }
- .app-header p {
-     font-size: 1.1em;
-     color: #666;
-     margin-bottom: 20px;
- }
- /* Audio player styling */
- .audio-output {
-     margin-top: 20px;
- }
- /* Make output area more prominent */
- .output-container {
-     background-color: rgba(255, 255, 255, 0.85);
-     border-radius: 16px;
-     padding: 24px;
-     box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
-     margin-top: 20px;
- }
- """
-
-
  def build_interface():
      # Build interface with enhanced visual elements and layout
-     with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+     with gr.Blocks() as demo:
          # Header section
          with gr.Column(elem_classes="app-header"):
              gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
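For the runtime patches added in the second hunk, a throwaway sanity check (not part of the commit) confirms the intent: once the identity wrapper is installed, torch.compile simply hands the function back and everything runs eagerly.

    import os
    import torch

    os.environ["TORCHDYNAMO_DISABLE"] = "1"      # same kill-switch the commit sets
    torch.compile = lambda f, *_, **__: f        # same no-op wrapper as in the diff

    def double(x):
        return x * 2

    assert torch.compile(double) is double       # compile is now a pass-through
    print(double(torch.tensor(3)))               # tensor(6), plain eager execution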
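The _patch_cuda_props helper, also added in the second hunk, only back-fills regs_per_multiprocessor and max_threads_per_multi_processor when the CUDA device-property struct of the running torch build does not already expose them. A quick inspection sketch, assuming a CUDA-visible machine (not part of the commit):

    import torch

    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        for field in ("regs_per_multiprocessor", "max_threads_per_multi_processor"):
            print(field, getattr(props, field, "missing in this build"))
    else:
        print("no CUDA device visible")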