Staticaliza committed on
Commit 71ed3fb · verified
1 Parent(s): f44a081

Update app.py

Files changed (1)
  1. app.py +78 -45
app.py CHANGED
@@ -1,63 +1,96 @@
-# app.py – only transformer, emotion cast to float, guidance ≥1.1
-import os, torch, numpy as np, gradio as gr, torchaudio, soundfile as sf, spaces
+# app.py ── Zonos TTS (transformer only, minimal UI)
+
+import os, tempfile, torch, numpy as np, gradio as gr, torchaudio, soundfile as sf, spaces
 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes
-import tempfile, soundfile as sf
 
-os.environ["TORCH_COMPILE_DISABLE"]=os.environ["TORCHINDUCTOR_DISABLE"]="1"
+# disable Torch-Inductor (keeps Spaces happy)
+os.environ["TORCH_COMPILE_DISABLE"] = os.environ["TORCHINDUCTOR_DISABLE"] = "1"
 torch._dynamo.disable()
-torch.compile=lambda f,*a,**k:f
+torch.compile = lambda f,*a,**k: f   # no-op
 
-device="cuda"
-model=Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer",device=device).eval()
+device = "cuda"
+model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device).eval()
 
-def _spk(aud):
-    if aud is None: return None
-    sr,wav=aud
-    if wav.dtype.kind in "iu":
-        wav=wav.astype(np.float32)/np.iinfo(wav.dtype).max
-    return model.make_speaker_embedding(torch.from_numpy(wav).unsqueeze(0),sr)
+# ─────────────────── helpers ────────────────────
+def _speaker_embed(aud):
+    if aud is None:
+        return None
+    sr, wav = aud                               # gradio returns (sr, np.ndarray)
+    if wav.dtype.kind in "iu":                  # int → float
+        wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
+    wav_t = torch.from_numpy(wav).unsqueeze(0)  # (1,C,N)
+    return model.make_speaker_embedding(wav_t, sr)
 
+# ─────────────────── inference ───────────────────
 @spaces.GPU
-def tts(text,lang,speaker,
+def tts(text, lang, speaker,
         e1,e2,e3,e4,e5,e6,e7,e8,
-        vq,fmax,pitch,rate,cfg,minp):
-    emotion=torch.tensor([float(x) for x in [e1,e2,e3,e4,e5,e6,e7,e8]],
-                         device=device,dtype=torch.float32)
-    cond=make_cond_dict(
-        text=text,language=lang,speaker=_spk(speaker),
+        vq, fmax, pitch, rate,
+        cfg, minp, tokens):
+
+    emotion = torch.tensor([float(x) for x in [e1,e2,e3,e4,e5,e6,e7,e8]],
+                           device=device, dtype=torch.float32)
+
+    cond = make_cond_dict(
+        text=text,
+        language=lang,
+        speaker=_speaker_embed(speaker),
         emotion=emotion,
-        vqscore_8=torch.tensor([vq]*8,device=device).unsqueeze(0),
-        fmax=float(fmax),pitch_std=float(pitch),
-        speaking_rate=float(rate),device=device)
+        vqscore_8=torch.tensor([vq]*8, device=device).unsqueeze(0),
+        fmax=float(fmax),
+        pitch_std=float(pitch),
+        speaking_rate=float(rate),
+        device=device
+    )
+
     with torch.no_grad():
-        wav=model.autoencoder.decode(
-            model.generate(model.prepare_conditioning(cond),
-                cfg_scale=float(cfg),sampling_params=dict(min_p=float(minp)))
-        )[0].cpu().clamp_(-1,1).numpy()
-    wav = np.squeeze(wav)              # drop singleton channel
-    if wav.ndim == 2:                  # (C, N) ➜ (N, C)
-        wav = wav.T
-
+        codes = model.generate(model.prepare_conditioning(cond),
+                               max_new_tokens=int(tokens),
+                               cfg_scale=float(cfg),
+                               sampling_params=dict(min_p=float(minp)))
+        wav = model.autoencoder.decode(codes)[0]   # (C,N) torch
+        wav = wav.cpu().clamp_(-1,1).numpy()       # → numpy
+
+    # >>> FIX <<< ensure (N,) or (N, C) for libsndfile
+    wav = np.squeeze(wav)
+    if wav.ndim == 2:              # currently (C,N)
+        wav = wav.T                # → (N,C)
+
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
         sf.write(tmp.name, wav,
                  model.autoencoder.sampling_rate,
                  format="WAV", subtype="PCM_16")
     return tmp.name
 
-langs=supported_language_codes
+# ─────────────────── UI ──────────────────────────
+langs = supported_language_codes
 with gr.Blocks() as demo:
-    t=gr.Textbox(label="text")
-    l=gr.Dropdown(langs,value="en-us",label="language")
-    s=gr.Audio(type="numpy",label="speaker ref (optional)")
-    emos=[gr.Slider(0,1,0.0,0.05,label=n) for n in
-          ["happiness","sadness","disgust","fear","surprise","anger","other","neutral"]]
-    vq=gr.Slider(0.5,0.9,0.78,0.01,label="clarity (vq)")
-    fmx=gr.Slider(0,24000,24000,100,label="fmax (hz)")
-    pit=gr.Slider(0,300,45,1,label="pitch variation")
-    rte=gr.Slider(5,30,15,0.5,label="speaking rate")
-    cfg=gr.Slider(1.1,5,2,0.1,label="guidance scale")   # min 1.1 to avoid assert
-    mp =gr.Slider(0,1,0.15,0.01,label="min-p")
-    out=gr.Audio(type="filepath",label="output")
-    gr.Button("generate").click(tts,[t,l,s,*emos,vq,fmx,pit,rte,cfg,mp],out)
-if __name__=="__main__": demo.launch()
+    text    = gr.Textbox(label="Text")
+    lang    = gr.Dropdown(langs, value="en-us", label="Language")
+    speaker = gr.Audio(type="numpy", label="Speaker ref (optional)")
+
+    # emotion sliders (all default 0)
+    emotions = []
+    for label in ["happiness","sadness","disgust","fear",
+                  "surprise","anger","other","neutral"]:
+        emotions.append(gr.Slider(0,1,0.0,0.05,label=label))
+
+    vq     = gr.Slider(0.5,0.9,0.78,0.01,label="clarity (vq)")
+    fmax   = gr.Slider(0,24000,24000,100,label="fmax (Hz)")
+    pitch  = gr.Slider(0,300,45,1,label="pitch variation")
+    rate   = gr.Slider(5,30,15,0.5,label="speaking rate")
+    cfg    = gr.Slider(1.1,5,2,0.1,label="guidance scale")
+    minp   = gr.Slider(0,1,0.15,0.01,label="min-p")
+    tokens = gr.Slider(0,3000,300,1,label="tokens (≈steps)")
+
+    out = gr.Audio(type="filepath", label="Output")
+    gr.Button("Generate").click(
+        tts,
+        inputs=[text, lang, speaker, *emotions,
+                vq, fmax, pitch, rate, cfg, minp, tokens],
+        outputs=out
+    )
+
+if __name__ == "__main__":
+    demo.launch()
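
A minimal sketch of the array-layout handling that the ">>> FIX <<<" block above relies on, assuming (as the diff comments state) that the decoder returns channel-first (C, N) audio: soundfile's write accepts only (frames,) or (frames, channels), so channel-first data is squeezed and, if still 2-D, transposed before writing. The to_soundfile_layout helper and the random input below are illustrative only and are not part of app.py.

import numpy as np
import soundfile as sf

def to_soundfile_layout(wav: np.ndarray) -> np.ndarray:
    # Drop singleton batch/channel dims, e.g. (1, N) -> (N,).
    wav = np.squeeze(wav)
    # Anything still 2-D is assumed channel-first (C, N); transpose to (N, C).
    if wav.ndim == 2:
        wav = wav.T
    return wav

if __name__ == "__main__":
    # Hypothetical stand-in for decoder output: 1 channel, 1 second at 44.1 kHz.
    fake = np.random.uniform(-1, 1, (1, 44100)).astype(np.float32)
    sf.write("demo.wav", to_soundfile_layout(fake), 44100, format="WAV", subtype="PCM_16")

For mono output of shape (1, N) the squeeze alone suffices; the transpose only applies to genuinely multi-channel audio.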