John Tran committed
Commit e128d3e · 1 Parent(s): a3b4dec

Add application file

Files changed (1):
  1. app.py +372 -4

app.py CHANGED
@@ -1,8 +1,376 @@

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()

+ import os
+ import shlex
+ import subprocess
+
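+ # Install GPU-only dependencies at app startup: flash-attn with its CUDA build
+ # skipped (FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE), plus prebuilt mamba-ssm and
+ # causal-conv1d wheels matching CUDA 12 / torch 2.4 / Python 3.10.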
+ subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
+ subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
+ subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
+
+ import spaces
+ import torch
+ import torchaudio
  import gradio as gr
+ from os import getenv
+
+ from zonos.model import Zonos
+ from zonos.conditioning import make_cond_dict, supported_language_codes
+
+ device = "cuda"
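+ # Load both Zonos checkpoints onto the GPU up front so the UI can switch models
+ # without reloading (assumes the device has enough memory for both).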
+ MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
+ MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
+ for model in MODELS.values():
+     model.requires_grad_(False).eval()
+
+
+ def update_ui(model_choice):
+     """
+     Dynamically show/hide UI elements based on the model's conditioners.
+     We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
+     """
+     model = MODELS[model_choice]
+     cond_names = [c.name for c in model.prefix_conditioner.conditioners]
+     print("Conditioners in this model:", cond_names)
+
+     text_update = gr.update(visible=("espeak" in cond_names))
+     language_update = gr.update(visible=("espeak" in cond_names))
+     speaker_audio_update = gr.update(visible=("speaker" in cond_names))
+     prefix_audio_update = gr.update(visible=True)
+     emotion1_update = gr.update(visible=("emotion" in cond_names))
+     emotion2_update = gr.update(visible=("emotion" in cond_names))
+     emotion3_update = gr.update(visible=("emotion" in cond_names))
+     emotion4_update = gr.update(visible=("emotion" in cond_names))
+     emotion5_update = gr.update(visible=("emotion" in cond_names))
+     emotion6_update = gr.update(visible=("emotion" in cond_names))
+     emotion7_update = gr.update(visible=("emotion" in cond_names))
+     emotion8_update = gr.update(visible=("emotion" in cond_names))
+     vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
+     fmax_slider_update = gr.update(visible=("fmax" in cond_names))
+     pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
+     speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
+     dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
+     speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
+     unconditional_keys_update = gr.update(
+         choices=[name for name in cond_names if name not in ("espeak", "language_id")]
+     )
+
+     return (
+         text_update,
+         language_update,
+         speaker_audio_update,
+         prefix_audio_update,
+         emotion1_update,
+         emotion2_update,
+         emotion3_update,
+         emotion4_update,
+         emotion5_update,
+         emotion6_update,
+         emotion7_update,
+         emotion8_update,
+         vq_single_slider_update,
+         fmax_slider_update,
+         pitch_std_slider_update,
+         speaking_rate_slider_update,
+         dnsmos_slider_update,
+         speaker_noised_checkbox_update,
+         unconditional_keys_update,
+     )
+
+
+ @spaces.GPU(duration=120)
+ def generate_audio(
+     model_choice,
+     text,
+     language,
+     speaker_audio,
+     prefix_audio,
+     e1,
+     e2,
+     e3,
+     e4,
+     e5,
+     e6,
+     e7,
+     e8,
+     vq_single,
+     fmax,
+     pitch_std,
+     speaking_rate,
+     dnsmos_ovrl,
+     speaker_noised,
+     cfg_scale,
+     min_p,
+     seed,
+     randomize_seed,
+     unconditional_keys,
+     progress=gr.Progress(),
+ ):
+     """
+     Generates audio based on the provided UI parameters.
+     We do NOT use language_id or ctc_loss even if the model has them.
+     """
+     selected_model = MODELS[model_choice]
+
+     speaker_noised_bool = bool(speaker_noised)
+     fmax = float(fmax)
+     pitch_std = float(pitch_std)
+     speaking_rate = float(speaking_rate)
+     dnsmos_ovrl = float(dnsmos_ovrl)
+     cfg_scale = float(cfg_scale)
+     min_p = float(min_p)
+     seed = int(seed)
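+     # The autoencoder emits roughly 86 token frames per second of audio, so this
+     # caps generation at about 30 seconds of output.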
+     max_new_tokens = 86 * 30
+
+     if randomize_seed:
+         seed = torch.randint(0, 2**32 - 1, (1,)).item()
+     torch.manual_seed(seed)
+
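+     # Voice cloning: embed the reference audio unless "speaker" is set to unconditional.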
+     speaker_embedding = None
+     if speaker_audio is not None and "speaker" not in unconditional_keys:
+         wav, sr = torchaudio.load(speaker_audio)
+         speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
+         speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
+
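+     # Optional audio prefix: downmix to mono, resample to the autoencoder's rate,
+     # and encode it so generation continues from the end of this audio.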
+     audio_prefix_codes = None
+     if prefix_audio is not None:
+         wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
+         wav_prefix = wav_prefix.mean(0, keepdim=True)
+         wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
+         wav_prefix = wav_prefix.to(device, dtype=torch.float32)
+         with torch.autocast(device, dtype=torch.float32):
+             audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
+
+     emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
+
+     vq_val = float(vq_single)
+     vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
+
+     cond_dict = make_cond_dict(
+         text=text,
+         language=language,
+         speaker=speaker_embedding,
+         emotion=emotion_tensor,
+         vqscore_8=vq_tensor,
+         fmax=fmax,
+         pitch_std=pitch_std,
+         speaking_rate=speaking_rate,
+         dnsmos_ovrl=dnsmos_ovrl,
+         speaker_noised=speaker_noised_bool,
+         device=device,
+         unconditional_keys=unconditional_keys,
+     )
+     conditioning = selected_model.prepare_conditioning(cond_dict)
+
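+     # Rough progress estimate for the UI: assume ~400 characters take ~30 seconds
+     # of speech at ~86 generation steps per second of audio.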
+     estimated_generation_duration = 30 * len(text) / 400
+     estimated_total_steps = int(estimated_generation_duration * 86)
+
+     def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
+         progress((step, estimated_total_steps))
+         return True
+
+     codes = selected_model.generate(
+         prefix_conditioning=conditioning,
+         audio_prefix_codes=audio_prefix_codes,
+         max_new_tokens=max_new_tokens,
+         cfg_scale=cfg_scale,
+         batch_size=1,
+         sampling_params=dict(min_p=min_p),
+         callback=update_progress,
+     )
+
+     wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
+     sr_out = selected_model.autoencoder.sampling_rate
+     if wav_out.dim() == 2 and wav_out.size(0) > 1:
+         wav_out = wav_out[0:1, :]
+     return (sr_out, wav_out.squeeze().numpy()), seed
+
+
+ def build_interface():
+     with gr.Blocks(theme='ParityError/Interstellar') as demo:
+         gr.Markdown("# Zonos v0.1")
+         gr.Markdown("State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f), [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio) ")
+         with gr.Row():
+             with gr.Column():
+                 text = gr.Textbox(
+                     label="Text to Synthesize",
+                     value="Zonos uses eSpeak for text to phoneme conversion!",
+                     lines=4,
+                     max_length=500,  # approximately
+                 )
+                 with gr.Row():
+                     language = gr.Dropdown(
+                         choices=supported_language_codes,
+                         value="en-us",
+                         label="Language",
+                     )
+                     model_choice = gr.Dropdown(
+                         choices=MODEL_NAMES,
+                         value="Zyphra/Zonos-v0.1-transformer",
+                         label="Zonos Model Type",
+                         info="Select the model variant to use.",
+                     )
+                     speaker_noised_checkbox = gr.Checkbox(
+                         label="Denoise Speaker?",
+                         value=False
+                     )
+                 speaker_audio = gr.Audio(
+                     label="Optional Speaker Audio (for cloning)",
+                     type="filepath",
+                 )
+                 generate_button = gr.Button("Generate Audio")
+
+             with gr.Column():
+                 output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
+
+         with gr.Accordion("Toggles", open=True):
+             gr.Markdown(
+                 "### Emotion Sliders\n"
+                 "Warning: The way these sliders work is not intuitive and may require some trial and error to get the desired effect.\n"
+                 "Certain configurations can cause the model to become unstable. Setting emotion to unconditional may help."
+             )
+             with gr.Row():
+                 emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness")
+                 emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness")
+                 emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust")
+                 emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear")
+             with gr.Row():
+                 emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise")
+                 emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger")
+                 emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other")
+                 emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral")
+
+             gr.Markdown(
+                 "### Unconditional Toggles\n"
+                 "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
+                 'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
+             )
+             with gr.Row():
+                 unconditional_keys = gr.CheckboxGroup(
+                     [
+                         "speaker",
+                         "emotion",
+                         "vqscore_8",
+                         "fmax",
+                         "pitch_std",
+                         "speaking_rate",
+                         "dnsmos_ovrl",
+                         "speaker_noised",
+                     ],
+                     value=["emotion"],
+                     label="Unconditional Keys",
+                 )
+
+         with gr.Accordion("Advanced Settings", open=False):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("## Conditioning Parameters")
+                     dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
+                     fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Fmax (Hz)")
+                     vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score")
+                     pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Std")
+                     speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate")
+
+                 with gr.Column():
+                     gr.Markdown("## Generation Parameters")
+                     cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
+                     min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
+                     seed_number = gr.Number(label="Seed", value=420, precision=0)
+                     randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
+
+         prefix_audio = gr.Audio(
+             value="assets/silence_100ms.wav",
+             label="Optional Prefix Audio (continue from this audio)",
+             type="filepath",
+         )
+
+         model_choice.change(
+             fn=update_ui,
+             inputs=[model_choice],
+             outputs=[
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 unconditional_keys,
+             ],
+         )
+
+         # On page load, trigger the same UI refresh
+         demo.load(
+             fn=update_ui,
+             inputs=[model_choice],
+             outputs=[
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 unconditional_keys,
+             ],
+         )
+
+         # Generate audio on button click
+         generate_button.click(
+             fn=generate_audio,
+             inputs=[
+                 model_choice,
+                 text,
+                 language,
+                 speaker_audio,
+                 prefix_audio,
+                 emotion1,
+                 emotion2,
+                 emotion3,
+                 emotion4,
+                 emotion5,
+                 emotion6,
+                 emotion7,
+                 emotion8,
+                 vq_single_slider,
+                 fmax_slider,
+                 pitch_std_slider,
+                 speaking_rate_slider,
+                 dnsmos_slider,
+                 speaker_noised_checkbox,
+                 cfg_scale_slider,
+                 min_p_slider,
+                 seed_number,
+                 randomize_seed_toggle,
+                 unconditional_keys,
+             ],
+             outputs=[output_audio, seed_number],
+         )
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = build_interface()
+     share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=share, ssr_mode=False)