Staticaliza committed
Commit 39d0c1a · verified · 1 Parent(s): dec0f9c

Update app.py

Files changed (1):
app.py +81 -325
app.py CHANGED
@@ -1,44 +1,18 @@
- import os
- import shlex
- import subprocess
-
- subprocess.run(
-     shlex.split("pip install flash-attn --no-build-isolation"),
-     env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     check=True,
- )
- subprocess.run(
-     shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
-     check=True,
- )
- subprocess.run(
-     shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
-     check=True,
- )
-
- import spaces
- import torch
- import torchaudio
- import gradio as gr
- from os import getenv
-
  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

- # 1. hard-kill torch.compile / dynamo / inductor so they never run
  os.environ["TORCH_COMPILE_DISABLE"] = "1"
  os.environ["TORCHINDUCTOR_DISABLE"] = "1"
- os.environ["TORCHDYNAMO_DISABLE"] = "1"  # <- the one that actually blocks torch._dynamo
- os.environ["TORCHDYNAMO_SUPPRESS_ERRORS"] = "True"  # fall back to eager if something still slips through
-
- torch._dynamo.disable()  # guard for older versions
- torch.compile = lambda f, *_, **__: f  # no-op wrapper

  device = "cuda"
- MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
- MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
- for model in MODELS.values():
-     model.requires_grad_(False).eval()

  def _patch_cuda_props():
      if torch.cuda.is_available():
@@ -48,70 +22,15 @@ def _patch_cuda_props():
          setattr(p, "regs_per_multiprocessor", 65536)
      if not hasattr(p, "max_threads_per_multi_processor"):
          setattr(p, "max_threads_per_multi_processor", 2048)
-
- _patch_cuda_props()

- def update_ui(model_choice):
-     """
-     Dynamically show/hide UI elements based on the model's conditioners.
-     We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
-     """
-     model = MODELS[model_choice]
-     cond_names = [c.name for c in model.prefix_conditioner.conditioners]
-     print("Conditioners in this model:", cond_names)

-     text_update = gr.update(visible=("espeak" in cond_names))
-     language_update = gr.update(visible=("espeak" in cond_names))
-     speaker_audio_update = gr.update(visible=("speaker" in cond_names))
-     prefix_audio_update = gr.update(visible=True)
-     emotion1_update = gr.update(visible=("emotion" in cond_names))
-     emotion2_update = gr.update(visible=("emotion" in cond_names))
-     emotion3_update = gr.update(visible=("emotion" in cond_names))
-     emotion4_update = gr.update(visible=("emotion" in cond_names))
-     emotion5_update = gr.update(visible=("emotion" in cond_names))
-     emotion6_update = gr.update(visible=("emotion" in cond_names))
-     emotion7_update = gr.update(visible=("emotion" in cond_names))
-     emotion8_update = gr.update(visible=("emotion" in cond_names))
-     vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
-     fmax_slider_update = gr.update(visible=("fmax" in cond_names))
-     pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
-     speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
-     dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
-     speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
-     unconditional_keys_update = gr.update(
-         choices=[name for name in cond_names if name not in ("espeak", "language_id")]
-     )

-     return (
-         text_update,
-         language_update,
-         speaker_audio_update,
-         prefix_audio_update,
-         emotion1_update,
-         emotion2_update,
-         emotion3_update,
-         emotion4_update,
-         emotion5_update,
-         emotion6_update,
-         emotion7_update,
-         emotion8_update,
-         vq_single_slider_update,
-         fmax_slider_update,
-         pitch_std_slider_update,
-         speaking_rate_slider_update,
-         dnsmos_slider_update,
-         speaker_noised_checkbox_update,
-         unconditional_keys_update,
-     )


- @spaces.GPU(duration=120)
  def generate_audio(
-     model_choice,
      text,
      language,
      speaker_audio,
-     prefix_audio,
      e1,
      e2,
      e3,
@@ -120,58 +39,35 @@ def generate_audio(
      e6,
      e7,
      e8,
-     vq_single,
      fmax,
      pitch_std,
      speaking_rate,
      dnsmos_ovrl,
-     speaker_noised,
      cfg_scale,
      min_p,
      seed,
      randomize_seed,
-     unconditional_keys,
      progress=gr.Progress(),
  ):
-     """
-     Generates audio based on the provided UI parameters.
-     We do NOT use language_id or ctc_loss even if the model has them.
-     """
-     selected_model = MODELS[model_choice]
-
-     speaker_noised_bool = bool(speaker_noised)
-     fmax = float(fmax)
-     pitch_std = float(pitch_std)
-     speaking_rate = float(speaking_rate)
-     dnsmos_ovrl = float(dnsmos_ovrl)
-     cfg_scale = float(cfg_scale)
-     min_p = float(min_p)
-     seed = int(seed)
-     max_new_tokens = 86 * 30
-
      if randomize_seed:
          seed = torch.randint(0, 2**32 - 1, (1,)).item()
-     torch.manual_seed(seed)

      speaker_embedding = None
-     if speaker_audio is not None and "speaker" not in unconditional_keys:
          wav, sr = torchaudio.load(speaker_audio)
-         speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
-         speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
-
-     audio_prefix_codes = None
-     if prefix_audio is not None:
-         wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
-         wav_prefix = wav_prefix.mean(0, keepdim=True)
-         wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
-         wav_prefix = wav_prefix.to(device, dtype=torch.float32)
-         with torch.autocast(device, dtype=torch.float32):
-             audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
-
-     emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)

-     vq_val = float(vq_single)
-     vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)

      cond_dict = make_cond_dict(
          text=text,
@@ -179,244 +75,104 @@ def generate_audio(
          speaker=speaker_embedding,
          emotion=emotion_tensor,
          vqscore_8=vq_tensor,
-         fmax=fmax,
-         pitch_std=pitch_std,
-         speaking_rate=speaking_rate,
-         dnsmos_ovrl=dnsmos_ovrl,
-         speaker_noised=speaker_noised_bool,
          device=device,
-         unconditional_keys=unconditional_keys,
      )
-     conditioning = selected_model.prepare_conditioning(cond_dict)

-     estimated_generation_duration = 30 * len(text) / 400
-     estimated_total_steps = int(estimated_generation_duration * 86)

-     def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
          progress((step, estimated_total_steps))
          return True

-     codes = selected_model.generate(
          prefix_conditioning=conditioning,
-         audio_prefix_codes=audio_prefix_codes,
-         max_new_tokens=max_new_tokens,
-         cfg_scale=cfg_scale,
          batch_size=1,
-         sampling_params=dict(min_p=min_p),
-         callback=update_progress,
      )

-     wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
-     sr_out = selected_model.autoencoder.sampling_rate
      if wav_out.dim() == 2 and wav_out.size(0) > 1:
          wav_out = wav_out[0:1, :]
      return (sr_out, wav_out.squeeze().numpy()), seed


  def build_interface():
-     # Build interface with enhanced visual elements and layout
      with gr.Blocks() as demo:
-         # Header section
-         with gr.Column(elem_classes="app-header"):
-             gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
-             gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
-
-         # Main content container
-         with gr.Column(elem_classes="container"):
-             # First panel - Text & Model Selection
-             with gr.Column(elem_classes="panel"):
-                 gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         model_choice = gr.Dropdown(
-                             choices=MODEL_NAMES,
-                             value="Zyphra/Zonos-v0.1-transformer",
-                             label="Zonos Model Type",
-                             info="Select the model variant to use.",
-                         )
-                         text = gr.Textbox(
-                             label="Text to Synthesize",
-                             value="Zonos uses eSpeak for text to phoneme conversion!",
-                             lines=4,
-                             max_length=500,
-                         )
-                         language = gr.Dropdown(
-                             choices=supported_language_codes,
-                             value="en-us",
-                             label="Language Code",
-                             info="Select a language code.",
-                         )
-                     with gr.Column(scale=1):
-                         prefix_audio = gr.Audio(
-                             value="assets/silence_100ms.wav",
-                             label="Optional Prefix Audio (continue from this audio)",
-                             type="filepath",
-                         )
-
-             # Second panel - Voice Characteristics
-             with gr.Column(elem_classes="panel"):
-                 gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
-                 with gr.Row():
-                     with gr.Column(scale=1):
-                         speaker_audio = gr.Audio(
-                             label="Optional Speaker Audio (for voice cloning)",
-                             type="filepath",
-                         )
-                         speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
-
-                     with gr.Column(scale=2):
-                         with gr.Row():
-                             with gr.Column():
-                                 dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
-                                 fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
-                                 vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
-                             with gr.Column():
-                                 pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
-                                 speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
-
-             # Third panel - Generation Parameters
-             with gr.Column(elem_classes="panel"):
-                 gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
-                 with gr.Row():
-                     with gr.Column():
-                         cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
-                         min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
-                     with gr.Column():
-                         seed_number = gr.Number(label="Seed", value=420, precision=0)
-                         randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
-
-             # Emotion Panel with Tabbed Interface
-             with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
-                 gr.Markdown(
-                     "Adjust these sliders to control the emotional tone of the generated speech.\n"
-                     "For a neutral voice, keep 'Neutral' high and other emotions low."
-                 )
-                 with gr.Row(elem_classes="emotion-grid"):
-                     emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
-                     emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
-                     emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
-                     emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
-                 with gr.Row(elem_classes="emotion-grid"):
-                     emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
-                     emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
-                     emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
-                     emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
-
-             # Advanced Settings Panel
-             with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
-                 gr.Markdown(
-                     "### Unconditional Toggles\n"
-                     "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
-                     'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
-                 )
-                 unconditional_keys = gr.CheckboxGroup(
-                     [
-                         "speaker",
-                         "emotion",
-                         "vqscore_8",
-                         "fmax",
-                         "pitch_std",
-                         "speaking_rate",
-                         "dnsmos_ovrl",
-                         "speaker_noised",
-                     ],
-                     value=["emotion"],
-                     label="Unconditional Keys",
-                 )
-
-             # Generate Button and Output Area
-             with gr.Column(elem_classes="panel output-container"):
-                 gr.Markdown('<div class="title">🔊 Generate & Output</div>')
-                 generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
-                 output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")

-         model_choice.change(
-             fn=update_ui,
-             inputs=[model_choice],
-             outputs=[
-                 text,
-                 language,
-                 speaker_audio,
-                 prefix_audio,
-                 emotion1,
-                 emotion2,
-                 emotion3,
-                 emotion4,
-                 emotion5,
-                 emotion6,
-                 emotion7,
-                 emotion8,
-                 vq_single_slider,
-                 fmax_slider,
-                 pitch_std_slider,
-                 speaking_rate_slider,
-                 dnsmos_slider,
-                 speaker_noised_checkbox,
-                 unconditional_keys,
-             ],
          )

-         # On page load, trigger the same UI refresh
-         demo.load(
-             fn=update_ui,
-             inputs=[model_choice],
-             outputs=[
-                 text,
-                 language,
-                 speaker_audio,
-                 prefix_audio,
-                 emotion1,
-                 emotion2,
-                 emotion3,
-                 emotion4,
-                 emotion5,
-                 emotion6,
-                 emotion7,
-                 emotion8,
-                 vq_single_slider,
-                 fmax_slider,
-                 pitch_std_slider,
-                 speaking_rate_slider,
-                 dnsmos_slider,
-                 speaker_noised_checkbox,
-                 unconditional_keys,
-             ],
-         )

-         # Generate audio on button click
          generate_button.click(
              fn=generate_audio,
              inputs=[
-                 model_choice,
                  text,
                  language,
                  speaker_audio,
-                 prefix_audio,
-                 emotion1,
-                 emotion2,
-                 emotion3,
-                 emotion4,
-                 emotion5,
-                 emotion6,
-                 emotion7,
-                 emotion8,
-                 vq_single_slider,
                  fmax_slider,
                  pitch_std_slider,
                  speaking_rate_slider,
                  dnsmos_slider,
-                 speaker_noised_checkbox,
                  cfg_scale_slider,
                  min_p_slider,
                  seed_number,
                  randomize_seed_toggle,
-                 unconditional_keys,
              ],
              outputs=[output_audio, seed_number],
          )
-
      return demo


  if __name__ == "__main__":
-     demo = build_interface()
-     demo.launch()
 
+ import os, torch, torchaudio, gradio as gr
  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

  os.environ["TORCH_COMPILE_DISABLE"] = "1"
  os.environ["TORCHINDUCTOR_DISABLE"] = "1"
+ os.environ["TORCHDYNAMO_DISABLE"] = "1"
+ os.environ["TORCHDYNAMO_SUPPRESS_ERRORS"] = "True"
+ torch._dynamo.disable()
+ torch.compile = lambda f, *_, **__: f

  device = "cuda"
+ MODEL_NAME = "Zyphra/Zonos-v0.1-transformer"
+ MODEL = Zonos.from_pretrained(MODEL_NAME, device=device).requires_grad_(False).eval()
+

  def _patch_cuda_props():
      if torch.cuda.is_available():
          setattr(p, "regs_per_multiprocessor", 65536)
      if not hasattr(p, "max_threads_per_multi_processor"):
          setattr(p, "max_threads_per_multi_processor", 2048)

+ _patch_cuda_props()


  def generate_audio(
      text,
      language,
      speaker_audio,
      e1,
      e2,
      e3,
      e4,
      e5,
      e6,
      e7,
      e8,
+     clarity,
      fmax,
      pitch_std,
      speaking_rate,
      dnsmos_ovrl,
      cfg_scale,
      min_p,
+     steps,
      seed,
      randomize_seed,
      progress=gr.Progress(),
  ):
      if randomize_seed:
          seed = torch.randint(0, 2**32 - 1, (1,)).item()
+     torch.manual_seed(int(seed))

      speaker_embedding = None
+     if speaker_audio is not None:
          wav, sr = torchaudio.load(speaker_audio)
+         speaker_embedding = (
+             MODEL.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)
+         )

+     emotion_tensor = torch.tensor(
+         [e1, e2, e3, e4, e5, e6, e7, e8], device=device, dtype=torch.float32
+     )
+     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.float32).unsqueeze(
+         0
+     )

      cond_dict = make_cond_dict(
          text=text,
          language=language,
          speaker=speaker_embedding,
          emotion=emotion_tensor,
          vqscore_8=vq_tensor,
+         fmax=float(fmax),
+         pitch_std=float(pitch_std),
+         speaking_rate=float(speaking_rate),
+         dnsmos_ovrl=float(dnsmos_ovrl),
          device=device,
      )
+     conditioning = MODEL.prepare_conditioning(cond_dict)

+     estimated_total_steps = int(steps)

+     def cb(_, step, __):
          progress((step, estimated_total_steps))
          return True

+     codes = MODEL.generate(
          prefix_conditioning=conditioning,
+         max_new_tokens=int(steps),
+         cfg_scale=float(cfg_scale),
          batch_size=1,
+         sampling_params=dict(min_p=float(min_p)),
+         callback=cb,
      )

+     wav_out = MODEL.autoencoder.decode(codes).cpu().detach()
+     sr_out = MODEL.autoencoder.sampling_rate
      if wav_out.dim() == 2 and wav_out.size(0) > 1:
          wav_out = wav_out[0:1, :]
      return (sr_out, wav_out.squeeze().numpy()), seed

+
  def build_interface():
      with gr.Blocks() as demo:
+         gr.Markdown("# zonos tts generator ✨")

+         text = gr.Textbox(label="text", value="hello, world!", lines=4, max_length=500)
+         language = gr.Dropdown(
+             choices=supported_language_codes, value="en-us", label="language"
          )
+         speaker_audio = gr.Audio(label="voice reference", type="filepath")

+         clarity_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="clarity")
+         steps_slider = gr.Slider(1, 3000, 300, 1, label="steps")
+
+         dnsmos_slider = gr.Slider(1.0, 5.0, 4.0, 0.1, label="quality")
+         fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
+         pitch_std_slider = gr.Slider(0.0, 300.0, 45.0, 1, label="pitch std")
+         speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.5, label="rate")
+
+         cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="guidance")
+         min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="min p")
+
+         with gr.Row():
+             e1 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="happy")
+             e2 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="sad")
+             e3 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="disgust")
+             e4 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="fear")
+
+         with gr.Row():
+             e5 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="surprise")
+             e6 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="anger")
+             e7 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="other")
+             e8 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="neutral")
+
+         seed_number = gr.Number(label="seed", value=420, precision=0)
+         randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
+
+         generate_button = gr.Button("generate")
+         output_audio = gr.Audio(label="output", type="numpy", autoplay=True)

          generate_button.click(
              fn=generate_audio,
              inputs=[
                  text,
                  language,
                  speaker_audio,
+                 e1,
+                 e2,
+                 e3,
+                 e4,
+                 e5,
+                 e6,
+                 e7,
+                 e8,
+                 clarity_slider,
                  fmax_slider,
                  pitch_std_slider,
                  speaking_rate_slider,
                  dnsmos_slider,
                  cfg_scale_slider,
                  min_p_slider,
+                 steps_slider,
                  seed_number,
                  randomize_seed_toggle,
              ],
              outputs=[output_audio, seed_number],
          )
      return demo


  if __name__ == "__main__":
+     build_interface().launch()
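
For reference, below is a minimal sketch of driving the simplified pipeline from the new app.py outside Gradio. It assumes the Zonos API exactly as exercised in the diff above (Zonos.from_pretrained, make_cond_dict, prepare_conditioning, generate, autoencoder.decode); the language keyword, the file paths, and the parameter values are illustrative, taken from the UI defaults, not part of the commit.

    import torch
    import torchaudio

    from zonos.model import Zonos
    from zonos.conditioning import make_cond_dict

    device = "cuda"
    model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)
    model.requires_grad_(False).eval()

    # Optional voice reference -> speaker embedding (bfloat16, as in the diff).
    wav, sr = torchaudio.load("reference.wav")  # illustrative path
    speaker = model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)

    cond_dict = make_cond_dict(
        text="hello, world!",
        language="en-us",                                   # assumed keyword, mirroring the UI wiring
        speaker=speaker,
        emotion=torch.zeros(8, device=device),              # the eight emotion sliders, all at 0.0
        vqscore_8=torch.full((1, 8), 0.78, device=device),  # "clarity" slider default
        fmax=24000.0,
        pitch_std=45.0,
        speaking_rate=15.0,
        dnsmos_ovrl=4.0,
        device=device,
    )
    conditioning = model.prepare_conditioning(cond_dict)

    codes = model.generate(
        prefix_conditioning=conditioning,
        max_new_tokens=300,                # the "steps" slider default
        cfg_scale=2.0,
        batch_size=1,
        sampling_params=dict(min_p=0.15),
    )

    wav_out = model.autoencoder.decode(codes).cpu().detach()
    if wav_out.dim() == 3:
        wav_out = wav_out.squeeze(0)       # drop batch dim for torchaudio.save
    torchaudio.save("output.wav", wav_out, model.autoencoder.sampling_rate)

Seeding (torch.manual_seed) and the progress callback are omitted here; they only matter for reproducibility and the Gradio progress bar.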