Staticaliza committed on
Commit e2288b2 · verified · 1 Parent(s): 71ed3fb

Update app.py

Files changed (1): app.py +625 -85
app.py CHANGED
@@ -1,96 +1,636 @@
- # app.py ── Zonos TTS (transformer only, minimal UI)

- import os, tempfile, torch, numpy as np, gradio as gr, torchaudio, soundfile as sf, spaces
  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

- # disable Torch-Inductor (keeps Spaces happy)
- os.environ["TORCH_COMPILE_DISABLE"] = os.environ["TORCHINDUCTOR_DISABLE"] = "1"
- torch._dynamo.disable()
- torch.compile = lambda f,*a,**k: f # no-op
-
  device = "cuda"
- model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device).eval()
-
- # ─────────────────── helpers ────────────────────
- def _speaker_embed(aud):
-     if aud is None:
-         return None
-     sr, wav = aud # gradio returns (sr, np.ndarray)
-     if wav.dtype.kind in "iu": # int → float
-         wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
-     wav_t = torch.from_numpy(wav).unsqueeze(0) # (1,C,N)
-     return model.make_speaker_embedding(wav_t, sr)
-
- # ─────────────────── inference ───────────────────
- @spaces.GPU
- def tts(text, lang, speaker,
-         e1,e2,e3,e4,e5,e6,e7,e8,
-         vq, fmax, pitch, rate,
-         cfg, minp, tokens):
-
-     emotion = torch.tensor([float(x) for x in [e1,e2,e3,e4,e5,e6,e7,e8]],
-                            device=device, dtype=torch.float32)
-
-     cond = make_cond_dict(
          text=text,
-         language=lang,
-         speaker=_speaker_embed(speaker),
-         emotion=emotion,
-         vqscore_8=torch.tensor([vq]*8, device=device).unsqueeze(0),
-         fmax=float(fmax),
-         pitch_std=float(pitch),
-         speaking_rate=float(rate),
-         device=device
      )

-     with torch.no_grad():
-         codes = model.generate(model.prepare_conditioning(cond),
-                                max_new_tokens=int(tokens),
-                                cfg_scale=float(cfg),
-                                sampling_params=dict(min_p=float(minp)))
-     wav = model.autoencoder.decode(codes)[0] # (C,N) torch
-     wav = wav.cpu().clamp_(-1,1).numpy() # → numpy
-
-     # >>> FIX <<< ensure (N,) or (N, C) for libsndfile
-     wav = np.squeeze(wav)
-     if wav.ndim == 2: # currently (C,N)
-         wav = wav.T # → (N,C)
-
-     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-         sf.write(tmp.name, wav,
-                  model.autoencoder.sampling_rate,
-                  format="WAV", subtype="PCM_16")
-     return tmp.name
-
- # ─────────────────── UI ──────────────────────────
- langs = supported_language_codes
- with gr.Blocks() as demo:
-     text = gr.Textbox(label="Text")
-     lang = gr.Dropdown(langs, value="en-us", label="Language")
-     speaker = gr.Audio(type="numpy", label="Speaker ref (optional)")
-
-     # emotion sliders (all default 0)
-     emotions = []
-     for label in ["happiness","sadness","disgust","fear",
-                   "surprise","anger","other","neutral"]:
-         emotions.append(gr.Slider(0,1,0.0,0.05,label=label))
-
-     vq = gr.Slider(0.5,0.9,0.78,0.01,label="clarity (vq)")
-     fmax = gr.Slider(0,24000,24000,100,label="fmax (Hz)")
-     pitch= gr.Slider(0,300,45,1,label="pitch variation")
-     rate = gr.Slider(5,30,15,0.5,label="speaking rate")
-     cfg = gr.Slider(1.1,5,2,0.1,label="guidance scale")
-     minp = gr.Slider(0,1,0.15,0.01,label="min-p")
-     tokens = gr.Slider(0,3000,300,1,label="tokens (≈steps)")
-
-     out = gr.Audio(type="filepath", label="Output")
-     gr.Button("Generate").click(
-         tts,
-         inputs=[text, lang, speaker, *emotions,
-                 vq, fmax, pitch, rate, cfg, minp, tokens],
-         outputs=out
      )

  if __name__ == "__main__":
-     demo.launch()
+ # # app.py ── Zonos TTS (transformer only, minimal UI)
+
+ # import os, tempfile, torch, numpy as np, gradio as gr, torchaudio, soundfile as sf, spaces
+ # from zonos.model import Zonos
+ # from zonos.conditioning import make_cond_dict, supported_language_codes
+
+ # # disable Torch-Inductor (keeps Spaces happy)
+ # os.environ["TORCH_COMPILE_DISABLE"] = os.environ["TORCHINDUCTOR_DISABLE"] = "1"
+ # torch._dynamo.disable()
+ # torch.compile = lambda f,*a,**k: f # no-op
+
+ # device = "cuda"
+ # model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device).eval()
+
+ # # ─────────────────── helpers ────────────────────
+ # def _speaker_embed(aud):
+ #     if aud is None:
+ #         return None
+ #     sr, wav = aud # gradio returns (sr, np.ndarray)
+ #     if wav.dtype.kind in "iu": # int → float
+ #         wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
+ #     wav_t = torch.from_numpy(wav).unsqueeze(0) # (1,C,N)
+ #     return model.make_speaker_embedding(wav_t, sr)
+
+ # # ─────────────────── inference ───────────────────
+ # @spaces.GPU
+ # def tts(text, lang, speaker,
+ #         e1,e2,e3,e4,e5,e6,e7,e8,
+ #         vq, fmax, pitch, rate,
+ #         cfg, minp, tokens):
+
+ #     emotion = torch.tensor([float(x) for x in [e1,e2,e3,e4,e5,e6,e7,e8]],
+ #                            device=device, dtype=torch.float32)
+
+ #     cond = make_cond_dict(
+ #         text=text,
+ #         language=lang,
+ #         speaker=_speaker_embed(speaker),
+ #         emotion=emotion,
+ #         vqscore_8=torch.tensor([vq]*8, device=device).unsqueeze(0),
+ #         fmax=float(fmax),
+ #         pitch_std=float(pitch),
+ #         speaking_rate=float(rate),
+ #         device=device
+ #     )
+
+ #     with torch.no_grad():
+ #         codes = model.generate(model.prepare_conditioning(cond),
+ #                                max_new_tokens=int(tokens),
+ #                                cfg_scale=float(cfg),
+ #                                sampling_params=dict(min_p=float(minp)))
+ #     wav = model.autoencoder.decode(codes)[0] # (C,N) torch
+ #     wav = wav.cpu().clamp_(-1,1).numpy() # → numpy
+
+ #     # >>> FIX <<< ensure (N,) or (N, C) for libsndfile
+ #     wav = np.squeeze(wav)
+ #     if wav.ndim == 2: # currently (C,N)
+ #         wav = wav.T # → (N,C)
+
+ #     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+ #         sf.write(tmp.name, wav,
+ #                  model.autoencoder.sampling_rate,
+ #                  format="WAV", subtype="PCM_16")
+ #     return tmp.name
+
+ # # ─────────────────── UI ──────────────────────────
+ # langs = supported_language_codes
+ # with gr.Blocks() as demo:
+ #     text = gr.Textbox(label="Text")
+ #     lang = gr.Dropdown(langs, value="en-us", label="Language")
+ #     speaker = gr.Audio(type="numpy", label="Speaker ref (optional)")
+
+ #     # emotion sliders (all default 0)
+ #     emotions = []
+ #     for label in ["happiness","sadness","disgust","fear",
+ #                   "surprise","anger","other","neutral"]:
+ #         emotions.append(gr.Slider(0,1,0.0,0.05,label=label))
+
+ #     vq = gr.Slider(0.5,0.9,0.78,0.01,label="clarity (vq)")
+ #     fmax = gr.Slider(0,24000,24000,100,label="fmax (Hz)")
+ #     pitch= gr.Slider(0,300,45,1,label="pitch variation")
+ #     rate = gr.Slider(5,30,15,0.5,label="speaking rate")
+ #     cfg = gr.Slider(1.1,5,2,0.1,label="guidance scale")
+ #     minp = gr.Slider(0,1,0.15,0.01,label="min-p")
+ #     tokens = gr.Slider(0,3000,300,1,label="tokens (≈steps)")
+
+ #     out = gr.Audio(type="filepath", label="Output")
+ #     gr.Button("Generate").click(
+ #         tts,
+ #         inputs=[text, lang, speaker, *emotions,
+ #                 vq, fmax, pitch, rate, cfg, minp, tokens],
+ #         outputs=out
+ #     )
+
+ # if __name__ == "__main__":
+ #     demo.launch()
+
+ import os
+ import shlex
+ import subprocess
+
+ subprocess.run(
+     shlex.split("pip install flash-attn --no-build-isolation"),
+     env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     check=True,
+ )
+ subprocess.run(
+     shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
+     check=True,
+ )
+ subprocess.run(
+     shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
+     check=True,
+ )
+
+ import spaces
+ import torch
+ import torchaudio
+ import gradio as gr
+ from os import getenv

  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

  device = "cuda"
+ MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
+ MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
+ for model in MODELS.values():
+     model.requires_grad_(False).eval()
+
+
+ def update_ui(model_choice):
+     """
+     Dynamically show/hide UI elements based on the model's conditioners.
+     We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
+     """
+     model = MODELS[model_choice]
+     cond_names = [c.name for c in model.prefix_conditioner.conditioners]
+     print("Conditioners in this model:", cond_names)
+
+     text_update = gr.update(visible=("espeak" in cond_names))
+     language_update = gr.update(visible=("espeak" in cond_names))
+     speaker_audio_update = gr.update(visible=("speaker" in cond_names))
+     prefix_audio_update = gr.update(visible=True)
+     emotion1_update = gr.update(visible=("emotion" in cond_names))
+     emotion2_update = gr.update(visible=("emotion" in cond_names))
+     emotion3_update = gr.update(visible=("emotion" in cond_names))
+     emotion4_update = gr.update(visible=("emotion" in cond_names))
+     emotion5_update = gr.update(visible=("emotion" in cond_names))
+     emotion6_update = gr.update(visible=("emotion" in cond_names))
+     emotion7_update = gr.update(visible=("emotion" in cond_names))
+     emotion8_update = gr.update(visible=("emotion" in cond_names))
+     vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
+     fmax_slider_update = gr.update(visible=("fmax" in cond_names))
+     pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
+     speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
+     dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
+     speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
+     unconditional_keys_update = gr.update(
+         choices=[name for name in cond_names if name not in ("espeak", "language_id")]
+     )
+
+     return (
+         text_update,
+         language_update,
+         speaker_audio_update,
+         prefix_audio_update,
+         emotion1_update,
+         emotion2_update,
+         emotion3_update,
+         emotion4_update,
+         emotion5_update,
+         emotion6_update,
+         emotion7_update,
+         emotion8_update,
+         vq_single_slider_update,
+         fmax_slider_update,
+         pitch_std_slider_update,
+         speaking_rate_slider_update,
+         dnsmos_slider_update,
+         speaker_noised_checkbox_update,
+         unconditional_keys_update,
+     )
+
+
+ @spaces.GPU(duration=120)
+ def generate_audio(
+     model_choice,
+     text,
+     language,
+     speaker_audio,
+     prefix_audio,
+     e1,
+     e2,
+     e3,
+     e4,
+     e5,
+     e6,
+     e7,
+     e8,
+     vq_single,
+     fmax,
+     pitch_std,
+     speaking_rate,
+     dnsmos_ovrl,
+     speaker_noised,
+     cfg_scale,
+     min_p,
+     seed,
+     randomize_seed,
+     unconditional_keys,
+     progress=gr.Progress(),
+ ):
+     """
+     Generates audio based on the provided UI parameters.
+     We do NOT use language_id or ctc_loss even if the model has them.
+     """
+     selected_model = MODELS[model_choice]
+
+     speaker_noised_bool = bool(speaker_noised)
+     fmax = float(fmax)
+     pitch_std = float(pitch_std)
+     speaking_rate = float(speaking_rate)
+     dnsmos_ovrl = float(dnsmos_ovrl)
+     cfg_scale = float(cfg_scale)
+     min_p = float(min_p)
+     seed = int(seed)
+     max_new_tokens = 86 * 30
+
+     if randomize_seed:
+         seed = torch.randint(0, 2**32 - 1, (1,)).item()
+     torch.manual_seed(seed)
+
+     speaker_embedding = None
+     if speaker_audio is not None and "speaker" not in unconditional_keys:
+         wav, sr = torchaudio.load(speaker_audio)
+         speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
+         speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
+
+     audio_prefix_codes = None
+     if prefix_audio is not None:
+         wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
+         wav_prefix = wav_prefix.mean(0, keepdim=True)
+         wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
+         wav_prefix = wav_prefix.to(device, dtype=torch.float32)
+         with torch.autocast(device, dtype=torch.float32):
+             audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
+
+     emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
+
+     vq_val = float(vq_single)
+     vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
+
+     cond_dict = make_cond_dict(
          text=text,
+         language=language,
+         speaker=speaker_embedding,
+         emotion=emotion_tensor,
+         vqscore_8=vq_tensor,
+         fmax=fmax,
+         pitch_std=pitch_std,
+         speaking_rate=speaking_rate,
+         dnsmos_ovrl=dnsmos_ovrl,
+         speaker_noised=speaker_noised_bool,
+         device=device,
+         unconditional_keys=unconditional_keys,
      )
+     conditioning = selected_model.prepare_conditioning(cond_dict)
+
+     estimated_generation_duration = 30 * len(text) / 400
+     estimated_total_steps = int(estimated_generation_duration * 86)

+     def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
+         progress((step, estimated_total_steps))
+         return True
+
+     codes = selected_model.generate(
+         prefix_conditioning=conditioning,
+         audio_prefix_codes=audio_prefix_codes,
+         max_new_tokens=max_new_tokens,
+         cfg_scale=cfg_scale,
+         batch_size=1,
+         sampling_params=dict(min_p=min_p),
+         callback=update_progress,
      )

+     wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
+     sr_out = selected_model.autoencoder.sampling_rate
+     if wav_out.dim() == 2 and wav_out.size(0) > 1:
+         wav_out = wav_out[0:1, :]
+     return (sr_out, wav_out.squeeze().numpy()), seed
+
+
+ # Custom CSS for pastel gradient background and enhanced UI
+ custom_css = """
+ .gradio-container {
+     background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
+     background-size: 400% 400%;
+     animation: gradient 15s ease infinite;
+ }
+ @keyframes gradient {
+     0% {
+         background-position: 0% 50%;
+     }
+     50% {
+         background-position: 100% 50%;
+     }
+     100% {
+         background-position: 0% 50%;
+     }
+ }
+ .container {
+     max-width: 1200px;
+     margin: 0 auto;
+     padding: 20px;
+ }
+ .panel {
+     background-color: rgba(255, 255, 255, 0.7);
+     border-radius: 16px;
+     padding: 20px;
+     box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+     margin-bottom: 16px;
+     backdrop-filter: blur(5px);
+     transition: all 0.3s ease;
+ }
+ .panel:hover {
+     box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
+     transform: translateY(-2px);
+ }
+ .title {
+     font-size: 1.2em;
+     font-weight: 600;
+     margin-bottom: 12px;
+     color: #6a3ea1;
+     border-bottom: 2px solid #f0e6ff;
+     padding-bottom: 8px;
+ }
+ .slider-container {
+     background-color: rgba(255, 255, 255, 0.5);
+     border-radius: 10px;
+     padding: 10px;
+     margin: 5px 0;
+ }
+ /* Make sliders more appealing */
+ input[type=range] {
+     height: 5px;
+     appearance: none;
+     width: 100%;
+     border-radius: 3px;
+     background: linear-gradient(90deg, #9c83e0, #83b1e0);
+ }
+ .generate-button {
+     background: linear-gradient(90deg, #a673ff, #7c4dff);
+     color: white;
+     border: none;
+     border-radius: 8px;
+     padding: 12px 24px;
+     font-size: 16px;
+     font-weight: 500;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
+     display: block;
+     width: 100%;
+     margin: 20px 0;
+ }
+ .generate-button:hover {
+     background: linear-gradient(90deg, #9c5eff, #6a3aff);
+     box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
+     transform: translateY(-2px);
+ }
+ /* Tabs styling */
+ .tabs {
+     display: flex;
+     border-bottom: 1px solid #e0e0e0;
+     margin-bottom: 20px;
+ }
+ .tab {
+     padding: 10px 20px;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     background-color: transparent;
+     border: none;
+     color: #666;
+ }
+ .tab.active {
+     color: #7c4dff;
+     border-bottom: 3px solid #7c4dff;
+     font-weight: 600;
+ }
+ /* Emotion sliders container */
+ .emotion-grid {
+     display: grid;
+     grid-template-columns: repeat(4, 1fr);
+     gap: 12px;
+ }
+ /* Header styling */
+ .app-header {
+     text-align: center;
+     margin-bottom: 25px;
+ }
+ .app-header h1 {
+     font-size: 2.5em;
+     color: #6a3ea1;
+     margin-bottom: 8px;
+     font-weight: 700;
+ }
+ .app-header p {
+     font-size: 1.1em;
+     color: #666;
+     margin-bottom: 20px;
+ }
+ /* Audio player styling */
+ .audio-output {
+     margin-top: 20px;
+ }
+ /* Make output area more prominent */
+ .output-container {
+     background-color: rgba(255, 255, 255, 0.85);
+     border-radius: 16px;
+     padding: 24px;
+     box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
+     margin-top: 20px;
+ }
+ """
+
+
+ def build_interface():
+     # Build interface with enhanced visual elements and layout
+     with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+         # Header section
+         with gr.Column(elem_classes="app-header"):
+             gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
+             gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
+
+         # Main content container
+         with gr.Column(elem_classes="container"):
+             # First panel - Text & Model Selection
+             with gr.Column(elem_classes="panel"):
+                 gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         model_choice = gr.Dropdown(
+                             choices=MODEL_NAMES,
+                             value="Zyphra/Zonos-v0.1-transformer",
+                             label="Zonos Model Type",
+                             info="Select the model variant to use.",
+                         )
+                         text = gr.Textbox(
+                             label="Text to Synthesize",
+                             value="Zonos uses eSpeak for text to phoneme conversion!",
+                             lines=4,
+                             max_length=500,
+                         )
+                         language = gr.Dropdown(
+                             choices=supported_language_codes,
+                             value="en-us",
+                             label="Language Code",
+                             info="Select a language code.",
+                         )
+                     with gr.Column(scale=1):
+                         prefix_audio = gr.Audio(
+                             value="assets/silence_100ms.wav",
+                             label="Optional Prefix Audio (continue from this audio)",
+                             type="filepath",
+                         )
+
+             # Second panel - Voice Characteristics
+             with gr.Column(elem_classes="panel"):
+                 gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         speaker_audio = gr.Audio(
+                             label="Optional Speaker Audio (for voice cloning)",
+                             type="filepath",
+                         )
+                         speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
+
+                     with gr.Column(scale=2):
+                         with gr.Row():
+                             with gr.Column():
+                                 dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
+                                 fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
+                                 vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
+                             with gr.Column():
+                                 pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
+                                 speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
+
+             # Third panel - Generation Parameters
+             with gr.Column(elem_classes="panel"):
+                 gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
+                 with gr.Row():
+                     with gr.Column():
+                         cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
+                         min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
+                     with gr.Column():
+                         seed_number = gr.Number(label="Seed", value=420, precision=0)
+                         randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
+
+             # Emotion Panel with Tabbed Interface
+             with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
+                 gr.Markdown(
+                     "Adjust these sliders to control the emotional tone of the generated speech.\n"
+                     "For a neutral voice, keep 'Neutral' high and other emotions low."
+                 )
+                 with gr.Row(elem_classes="emotion-grid"):
+                     emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
+                     emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
+                     emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
+                     emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
+                 with gr.Row(elem_classes="emotion-grid"):
+                     emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
+                     emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
+                     emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
+                     emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
+
+             # Advanced Settings Panel
+             with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
+                 gr.Markdown(
+                     "### Unconditional Toggles\n"
+                     "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
+                     'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
+                 )
+                 unconditional_keys = gr.CheckboxGroup(
+                     [
+                         "speaker",
+                         "emotion",
+                         "vqscore_8",
+                         "fmax",
+                         "pitch_std",
+                         "speaking_rate",
+                         "dnsmos_ovrl",
+                         "speaker_noised",
+                     ],
+                     value=["emotion"],
+                     label="Unconditional Keys",
+                 )
+
+             # Generate Button and Output Area
+             with gr.Column(elem_classes="panel output-container"):
+                 gr.Markdown('<div class="title">🔊 Generate & Output</div>')
+                 generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
+                 output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
+
+         model_choice.change(
+             fn=update_ui,
+             inputs=[model_choice],
+             outputs=[
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 unconditional_keys,
+             ],
+         )
+
+         # On page load, trigger the same UI refresh
+         demo.load(
+             fn=update_ui,
+             inputs=[model_choice],
+             outputs=[
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 unconditional_keys,
+             ],
+         )
+
+         # Generate audio on button click
+         generate_button.click(
+             fn=generate_audio,
+             inputs=[
+                 model_choice,
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 cfg_scale_slider,
+                 min_p_slider,
+                 seed_number,
+                 randomize_seed_toggle,
+                 unconditional_keys,
+             ],
+             outputs=[output_audio, seed_number],
+         )
+
+     return demo
+
+
  if __name__ == "__main__":
+     demo = build_interface()
+     share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)