Staticaliza committed
Commit 47581eb · verified · 1 Parent(s): 234b163

Update app.py

Files changed (1)
  1. app.py +643 -91
app.py CHANGED
@@ -1,23 +1,21 @@
- # Imports
- import gradio as gr
- import spaces
  import os
- import torch
- import torchaudio
- import time
-
- from zonos.model import Zonos
- from zonos.conditioning import make_cond_dict, supported_language_codes
-
- # Variables
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
+ import shlex
+ import subprocess
 
- device = "cuda"
-
- REPO = "Zyphra/Zonos-v0.1-transformer"
- model = Zonos.from_pretrained(REPO, device=device)
+ subprocess.run(
+     shlex.split("pip install flash-attn --no-build-isolation"),
+     env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     check=True,
+ )
+ subprocess.run(
+     shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
+     check=True,
+ )
+ subprocess.run(
+     shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
+     check=True,
+ )
 
- # Functions
  def patch_cuda():
      if torch.cuda.is_available():
          for i in range(torch.cuda.device_count()):
@@ -27,106 +25,660 @@ def patch_cuda():
              if not hasattr(p, "max_threads_per_multi_processor"):
                  setattr(p, "max_threads_per_multi_processor", 2048)
 
- @spaces.GPU
- def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral, clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p, steps, seed, randomize_seed):
-     if randomize_seed: seed = int(time.time())
+ patch_cuda()
+
+ import spaces
+ import torch
+ import torchaudio
+ import gradio as gr
+ from os import getenv
+
+ from zonos.model import Zonos
+ from zonos.conditioning import make_cond_dict, supported_language_codes
+
+ device = "cuda"
+ MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
+ MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
+ for model in MODELS.values():
+     model.requires_grad_(False).eval()
+
+
+ def update_ui(model_choice):
+     """
+     Dynamically show/hide UI elements based on the model's conditioners.
+     We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
+     """
+     model = MODELS[model_choice]
+     cond_names = [c.name for c in model.prefix_conditioner.conditioners]
+     print("Conditioners in this model:", cond_names)
+
+     text_update = gr.update(visible=("espeak" in cond_names))
+     language_update = gr.update(visible=("espeak" in cond_names))
+     speaker_audio_update = gr.update(visible=("speaker" in cond_names))
+     prefix_audio_update = gr.update(visible=True)
+     emotion1_update = gr.update(visible=("emotion" in cond_names))
+     emotion2_update = gr.update(visible=("emotion" in cond_names))
+     emotion3_update = gr.update(visible=("emotion" in cond_names))
+     emotion4_update = gr.update(visible=("emotion" in cond_names))
+     emotion5_update = gr.update(visible=("emotion" in cond_names))
+     emotion6_update = gr.update(visible=("emotion" in cond_names))
+     emotion7_update = gr.update(visible=("emotion" in cond_names))
+     emotion8_update = gr.update(visible=("emotion" in cond_names))
+     vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
+     fmax_slider_update = gr.update(visible=("fmax" in cond_names))
+     pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
+     speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
+     dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
+     speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
+     unconditional_keys_update = gr.update(
+         choices=[name for name in cond_names if name not in ("espeak", "language_id")]
+     )
+
+     return (
+         text_update,
+         language_update,
+         speaker_audio_update,
+         prefix_audio_update,
+         emotion1_update,
+         emotion2_update,
+         emotion3_update,
+         emotion4_update,
+         emotion5_update,
+         emotion6_update,
+         emotion7_update,
+         emotion8_update,
+         vq_single_slider_update,
+         fmax_slider_update,
+         pitch_std_slider_update,
+         speaking_rate_slider_update,
+         dnsmos_slider_update,
+         speaker_noised_checkbox_update,
+         unconditional_keys_update,
+     )
+
+
+ @spaces.GPU(duration=120)
+ def generate_audio(
+     model_choice,
+     text,
+     language,
+     speaker_audio,
+     prefix_audio,
+     e1,
+     e2,
+     e3,
+     e4,
+     e5,
+     e6,
+     e7,
+     e8,
+     vq_single,
+     fmax,
+     pitch_std,
+     speaking_rate,
+     dnsmos_ovrl,
+     speaker_noised,
+     cfg_scale,
+     min_p,
+     seed,
+     randomize_seed,
+     unconditional_keys,
+     progress=gr.Progress(),
+ ):
+     """
+     Generates audio based on the provided UI parameters.
+     We do NOT use language_id or ctc_loss even if the model has them.
+     """
+     selected_model = MODELS[model_choice]
+
+     speaker_noised_bool = bool(speaker_noised)
+     fmax = float(fmax)
+     pitch_std = float(pitch_std)
+     speaking_rate = float(speaking_rate)
+     dnsmos_ovrl = float(dnsmos_ovrl)
+     cfg_scale = float(cfg_scale)
+     min_p = float(min_p)
+     seed = int(seed)
+     max_new_tokens = 86 * 30
+
+     if randomize_seed:
+         seed = torch.randint(0, 2**32 - 1, (1,)).item()
      torch.manual_seed(seed)
 
      speaker_embedding = None
-     if speaker_audio is not None:
-         print(1)
-         print(speaker_audio)
+     if speaker_audio is not None and "speaker" not in unconditional_keys:
          wav, sr = torchaudio.load(speaker_audio)
-         print(2)
-         print(wav)
-         print(sr)
-         speaker_embedding = (model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16))
-         print(3)
-         print(speaker_embedding)
-
-     emotion_tensor = torch.tensor([emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral], device=device, dtype=torch.bfloat16)
-     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)
-     print(4)
-     print(emotion_tensor)
-     print(vq_tensor)
+         speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
+         speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
+
+     audio_prefix_codes = None
+     if prefix_audio is not None:
+         wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
+         wav_prefix = wav_prefix.mean(0, keepdim=True)
+         wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
+         wav_prefix = wav_prefix.to(device, dtype=torch.float32)
+         with torch.autocast(device, dtype=torch.float32):
+             audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
+
+     emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
+
+     vq_val = float(vq_single)
+     vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
 
      cond_dict = make_cond_dict(
-         text=input,
+         text=text,
          language=language,
          speaker=speaker_embedding,
         emotion=emotion_tensor,
         vqscore_8=vq_tensor,
-         fmax=float(fmax),
-         pitch_std=float(pitch_std),
-         speaking_rate=float(speaking_rate),
-         dnsmos_ovrl=float(dnsmos_ovrl),
+         fmax=fmax,
+         pitch_std=pitch_std,
+         speaking_rate=speaking_rate,
+         dnsmos_ovrl=dnsmos_ovrl,
+         speaker_noised=speaker_noised_bool,
          device=device,
+         unconditional_keys=unconditional_keys,
      )
-     print(5)
-     print(cond_dict)
-
-     conditioning = model.prepare_conditioning(cond_dict)
-     print(6)
-     print(conditioning)
+     conditioning = selected_model.prepare_conditioning(cond_dict)
+
+     estimated_generation_duration = 30 * len(text) / 400
+     estimated_total_steps = int(estimated_generation_duration * 86)
 
-     codes = model.generate(
+     def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
+         progress((step, estimated_total_steps))
+         return True
+
+     codes = selected_model.generate(
          prefix_conditioning=conditioning,
-         max_new_tokens=int(steps),
-         cfg_scale=float(cfg_scale),
+         audio_prefix_codes=audio_prefix_codes,
+         max_new_tokens=max_new_tokens,
+         cfg_scale=cfg_scale,
          batch_size=1,
-         sampling_params=dict(min_p=float(min_p)),
+         sampling_params=dict(min_p=min_p),
+         callback=update_progress,
      )
-     print(7)
-     print(codes)
-
-     wav_out = model.autoencoder.decode(codes).cpu().detach()
-     sr_out = model.autoencoder.sampling_rate
-     print(8)
-     print(wav_output)
-     print(sr_output)
+
+     wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
+     sr_out = selected_model.autoencoder.sampling_rate
+     if wav_out.dim() == 2 and wav_out.size(0) > 1:
+         wav_out = wav_out[0:1, :]
+     return (sr_out, wav_out.squeeze().numpy()), seed
+
+
+ # Custom CSS for pastel gradient background and enhanced UI
+ custom_css = """
+ .gradio-container {
+     background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
+     background-size: 400% 400%;
+     animation: gradient 15s ease infinite;
+ }
+ @keyframes gradient {
+     0% {
+         background-position: 0% 50%;
+     }
+     50% {
+         background-position: 100% 50%;
+     }
+     100% {
+         background-position: 0% 50%;
+     }
+ }
+ .container {
+     max-width: 1200px;
+     margin: 0 auto;
+     padding: 20px;
+ }
+ .panel {
+     background-color: rgba(255, 255, 255, 0.7);
+     border-radius: 16px;
+     padding: 20px;
+     box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+     margin-bottom: 16px;
+     backdrop-filter: blur(5px);
+     transition: all 0.3s ease;
+ }
+ .panel:hover {
+     box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
+     transform: translateY(-2px);
+ }
+ .title {
+     font-size: 1.2em;
+     font-weight: 600;
+     margin-bottom: 12px;
+     color: #6a3ea1;
+     border-bottom: 2px solid #f0e6ff;
+     padding-bottom: 8px;
+ }
+ .slider-container {
+     background-color: rgba(255, 255, 255, 0.5);
+     border-radius: 10px;
+     padding: 10px;
+     margin: 5px 0;
+ }
+ /* Make sliders more appealing */
+ input[type=range] {
+     height: 5px;
+     appearance: none;
+     width: 100%;
+     border-radius: 3px;
+     background: linear-gradient(90deg, #9c83e0, #83b1e0);
+ }
+ .generate-button {
+     background: linear-gradient(90deg, #a673ff, #7c4dff);
+     color: white;
+     border: none;
+     border-radius: 8px;
+     padding: 12px 24px;
+     font-size: 16px;
+     font-weight: 500;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
+     display: block;
+     width: 100%;
+     margin: 20px 0;
+ }
+ .generate-button:hover {
+     background: linear-gradient(90deg, #9c5eff, #6a3aff);
+     box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
+     transform: translateY(-2px);
+ }
+ /* Tabs styling */
+ .tabs {
+     display: flex;
+     border-bottom: 1px solid #e0e0e0;
+     margin-bottom: 20px;
+ }
+ .tab {
+     padding: 10px 20px;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     background-color: transparent;
+     border: none;
+     color: #666;
+ }
+ .tab.active {
+     color: #7c4dff;
+     border-bottom: 3px solid #7c4dff;
+     font-weight: 600;
+ }
+ /* Emotion sliders container */
+ .emotion-grid {
+     display: grid;
+     grid-template-columns: repeat(4, 1fr);
+     gap: 12px;
+ }
+ /* Header styling */
+ .app-header {
+     text-align: center;
+     margin-bottom: 25px;
+ }
+ .app-header h1 {
+     font-size: 2.5em;
+     color: #6a3ea1;
+     margin-bottom: 8px;
+     font-weight: 700;
+ }
+ .app-header p {
+     font-size: 1.1em;
+     color: #666;
+     margin-bottom: 20px;
+ }
+ /* Audio player styling */
+ .audio-output {
+     margin-top: 20px;
+ }
+ /* Make output area more prominent */
+ .output-container {
+     background-color: rgba(255, 255, 255, 0.85);
+     border-radius: 16px;
+     padding: 24px;
+     box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
+     margin-top: 20px;
+ }
+ """
+
+
+ def build_interface():
+     # Build interface with enhanced visual elements and layout
+     with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+         # Header section
+         with gr.Column(elem_classes="app-header"):
+             gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
+             gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
+
+         # Main content container
+         with gr.Column(elem_classes="container"):
+             # First panel - Text & Model Selection
+             with gr.Column(elem_classes="panel"):
+                 gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         model_choice = gr.Dropdown(
+                             choices=MODEL_NAMES,
+                             value="Zyphra/Zonos-v0.1-transformer",
+                             label="Zonos Model Type",
+                             info="Select the model variant to use.",
+                         )
+                         text = gr.Textbox(
+                             label="Text to Synthesize",
+                             value="Zonos uses eSpeak for text to phoneme conversion!",
+                             lines=4,
+                             max_length=500,
+                         )
+                         language = gr.Dropdown(
+                             choices=supported_language_codes,
+                             value="en-us",
+                             label="Language Code",
+                             info="Select a language code.",
+                         )
+                     with gr.Column(scale=1):
+                         prefix_audio = gr.Audio(
+                             value="assets/silence_100ms.wav",
+                             label="Optional Prefix Audio (continue from this audio)",
+                             type="filepath",
+                         )
+
+             # Second panel - Voice Characteristics
+             with gr.Column(elem_classes="panel"):
+                 gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         speaker_audio = gr.Audio(
+                             label="Optional Speaker Audio (for voice cloning)",
+                             type="filepath",
+                         )
+                         speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
+
+                     with gr.Column(scale=2):
+                         with gr.Row():
+                             with gr.Column():
+                                 dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
+                                 fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
+                                 vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
+                             with gr.Column():
+                                 pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
+                                 speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
+
+             # Third panel - Generation Parameters
+             with gr.Column(elem_classes="panel"):
+                 gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
+                 with gr.Row():
+                     with gr.Column():
+                         cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
+                         min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
+                     with gr.Column():
+                         seed_number = gr.Number(label="Seed", value=420, precision=0)
+                         randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
+
+             # Emotion Panel with Tabbed Interface
+             with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
+                 gr.Markdown(
+                     "Adjust these sliders to control the emotional tone of the generated speech.\n"
+                     "For a neutral voice, keep 'Neutral' high and other emotions low."
+                 )
+                 with gr.Row(elem_classes="emotion-grid"):
+                     emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
+                     emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
+                     emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
+                     emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
+                 with gr.Row(elem_classes="emotion-grid"):
+                     emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
+                     emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
+                     emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
+                     emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
+
+             # Advanced Settings Panel
+             with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
+                 gr.Markdown(
+                     "### Unconditional Toggles\n"
+                     "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
+                     'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
+                 )
+                 unconditional_keys = gr.CheckboxGroup(
+                     [
+                         "speaker",
+                         "emotion",
+                         "vqscore_8",
+                         "fmax",
+                         "pitch_std",
+                         "speaking_rate",
+                         "dnsmos_ovrl",
+                         "speaker_noised",
+                     ],
+                     value=["emotion"],
+                     label="Unconditional Keys",
+                 )
+
+             # Generate Button and Output Area
+             with gr.Column(elem_classes="panel output-container"):
+                 gr.Markdown('<div class="title">🔊 Generate & Output</div>')
+                 generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
+                 output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
+
+         model_choice.change(
+             fn=update_ui,
+             inputs=[model_choice],
+             outputs=[
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 unconditional_keys,
+             ],
+         )
+
+         # On page load, trigger the same UI refresh
+         demo.load(
+             fn=update_ui,
+             inputs=[model_choice],
+             outputs=[
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 unconditional_keys,
+             ],
+         )
+
+         # Generate audio on button click
+         generate_button.click(
+             fn=generate_audio,
+             inputs=[
+                 model_choice,
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 cfg_scale_slider,
+                 min_p_slider,
+                 seed_number,
+                 randomize_seed_toggle,
+                 unconditional_keys,
+             ],
+             outputs=[output_audio, seed_number],
+         )
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = build_interface()
+     share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)
+
 
-     if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :]
+ # # Imports
+ # import gradio as gr
+ # import spaces
+ # import os
+ # import torch
+ # import torchaudio
+ # import time
+
+ # from zonos.model import Zonos
+ # from zonos.conditioning import make_cond_dict, supported_language_codes
+
+ # # Variables
+ # HF_TOKEN = os.environ.get("HF_TOKEN", "")
+
+ # device = "cuda"
+
+ # REPO = "Zyphra/Zonos-v0.1-transformer"
+ # model = Zonos.from_pretrained(REPO, device=device)
+
+ # # Functions
+ # def patch_cuda():
+ #     if torch.cuda.is_available():
+ #         for i in range(torch.cuda.device_count()):
+ #             p = torch.cuda.get_device_properties(i)
+ #             if not hasattr(p, "regs_per_multiprocessor"):
+ #                 setattr(p, "regs_per_multiprocessor", 65536)
+ #             if not hasattr(p, "max_threads_per_multi_processor"):
+ #                 setattr(p, "max_threads_per_multi_processor", 2048)
+
+ # @spaces.GPU
+ # def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral, clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p, steps, seed, randomize_seed):
+ #     if randomize_seed: seed = int(time.time())
+ #     torch.manual_seed(seed)
+
+ #     speaker_embedding = None
+ #     if speaker_audio is not None:
+ #         print(1)
+ #         print(speaker_audio)
+ #         wav, sr = torchaudio.load(speaker_audio)
+ #         print(2)
+ #         print(wav)
+ #         print(sr)
+ #         speaker_embedding = (model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16))
+ #         print(3)
+ #         print(speaker_embedding)
+
+ #     emotion_tensor = torch.tensor([emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral], device=device, dtype=torch.bfloat16)
+ #     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)
+ #     print(4)
+ #     print(emotion_tensor)
+ #     print(vq_tensor)
+
+ #     cond_dict = make_cond_dict(
+ #         text=input,
+ #         language=language,
+ #         speaker=speaker_embedding,
+ #         emotion=emotion_tensor,
+ #         vqscore_8=vq_tensor,
+ #         fmax=float(fmax),
+ #         pitch_std=float(pitch_std),
+ #         speaking_rate=float(speaking_rate),
+ #         dnsmos_ovrl=float(dnsmos_ovrl),
+ #         device=device,
+ #     )
+ #     print(5)
+ #     print(cond_dict)
 
-     print(9)
-     print((sr_out, wav_out.squeeze().numpy()))
+ #     conditioning = model.prepare_conditioning(cond_dict)
+ #     print(6)
+ #     print(conditioning)
+
+ #     codes = model.generate(
+ #         prefix_conditioning=conditioning,
+ #         max_new_tokens=int(steps),
+ #         cfg_scale=float(cfg_scale),
+ #         batch_size=1,
+ #         sampling_params=dict(min_p=float(min_p)),
+ #     )
+ #     print(7)
+ #     print(codes)
+
+ #     wav_out = model.autoencoder.decode(codes).cpu().detach()
+ #     sr_out = model.autoencoder.sampling_rate
+ #     print(8)
+ #     print(wav_out)
+ #     print(sr_out)
 
-     return (sr_out, wav_out.squeeze().numpy())
+ #     if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :]
+
+ #     print(9)
+ #     print((sr_out, wav_out.squeeze().numpy()))
+
+ #     return (sr_out, wav_out.squeeze().numpy())
 
- # Initialize
- patch_cuda()
+ # # Initialize
+ # patch_cuda()
 
- with gr.Blocks() as main:
-     text = gr.Textbox(label="text", value="hello, world!")
-     language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
-     speaker_audio = gr.Audio(label="voice reference", type="filepath")
+ # with gr.Blocks() as main:
+ #     text = gr.Textbox(label="text", value="hello, world!")
+ #     language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
+ #     speaker_audio = gr.Audio(label="voice reference", type="filepath")
 
-     clarity_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="clarity")
-     steps_slider = gr.Slider(1, 3000, 316, 1, label="steps")
+ #     clarity_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="clarity")
+ #     steps_slider = gr.Slider(1, 3000, 316, 1, label="steps")
 
-     dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
-     fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
-     pitch_std_slider = gr.Slider(0.0, 1000.0, 30.0, 1, label="pitch std")
-     speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")
+ #     dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
+ #     fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
+ #     pitch_std_slider = gr.Slider(0.0, 1000.0, 30.0, 1, label="pitch std")
+ #     speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")
 
-     cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
-     min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.15, label="min p")
+ #     cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
+ #     min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.15, label="min p")
 
-     with gr.Row():
-         e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
-         e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
-         e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
-         e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
-         e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
-         e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
-         e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
-         e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")
+ #     with gr.Row():
+ #         e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
+ #         e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
+ #         e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
+ #         e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
+ #         e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
+ #         e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
+ #         e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
+ #         e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")
 
-     seed_number = gr.Number(label="seed", value=42, precision=0)
-     randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
+ #     seed_number = gr.Number(label="seed", value=42, precision=0)
+ #     randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
 
-     generate_button = gr.Button("generate")
-     output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
+ #     generate_button = gr.Button("generate")
+ #     output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
 
-     generate_button.click(fn=generate, inputs=[text, language, speaker_audio, e1, e2, e3, e4, e5, e6, e7, e8, clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider, dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider, seed_number, randomize_seed_toggle], outputs=output_audio)
+ #     generate_button.click(fn=generate, inputs=[text, language, speaker_audio, e1, e2, e3, e4, e5, e6, e7, e8, clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider, dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider, seed_number, randomize_seed_toggle], outputs=output_audio)
 
- main.launch()
+ # main.launch()