Spaces:

Staticaliza
/

Zero-5

Running on Zero

App Files Files Community

Staticaliza committed 13 days ago

Commit

47581eb

verified ·

1 Parent(s): 234b163

Update app.py

Browse files

Files changed (1) hide show

app.py +643 -91

app.py CHANGED Viewed

@@ -1,23 +1,21 @@
-# Imports
-import gradio as gr
-import spaces
 import os
-import torch
-import torchaudio
-import time
-from zonos.model import Zonos
-from zonos.conditioning import make_cond_dict, supported_language_codes
-# Variables
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-device = "cuda"
-REPO = "Zyphra/Zonos-v0.1-transformer"
-model = Zonos.from_pretrained(REPO, device=device)
-# Functions
 def patch_cuda():
     if torch.cuda.is_available():
         for i in range(torch.cuda.device_count()):
@@ -27,106 +25,660 @@ def patch_cuda():
             if not hasattr(p, "max_threads_per_multi_processor"):
                 setattr(p, "max_threads_per_multi_processor", 2048)
-@spaces.GPU
-def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral, clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p, steps, seed, randomize_seed):
-    if randomize_seed: seed = int(time.time())
     torch.manual_seed(seed)
     speaker_embedding = None
-    if speaker_audio is not None:
-        print(1)
-        print(speaker_audio)
         wav, sr = torchaudio.load(speaker_audio)
-        print(2)
-        print(wav)
-        print(sr)
-        speaker_embedding = (model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16))
-        print(3)
-        print(speaker_embedding)
-    emotion_tensor = torch.tensor([emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral], device=device, dtype=torch.bfloat16)
-    vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)
-    print(4)
-    print(emotion_tensor)
-    print(vq_tensor)
     cond_dict = make_cond_dict(
-        text=input,
         language=language,
         speaker=speaker_embedding,
         emotion=emotion_tensor,
         vqscore_8=vq_tensor,
-        fmax=float(fmax),
-        pitch_std=float(pitch_std),
-        speaking_rate=float(speaking_rate),
-        dnsmos_ovrl=float(dnsmos_ovrl),
         device=device,
     )
-    print(5)
-    print(cond_dict)
-    conditioning = model.prepare_conditioning(cond_dict)
-    print(6)
-    print(conditioning)
-    codes = model.generate(
         prefix_conditioning=conditioning,
-        max_new_tokens=int(steps),
-        cfg_scale=float(cfg_scale),
         batch_size=1,
-        sampling_params=dict(min_p=float(min_p)),
     )
-    print(7)
-    print(codes)
-    wav_out = model.autoencoder.decode(codes).cpu().detach()
-    sr_out = model.autoencoder.sampling_rate
-    print(8)
-    print(wav_output)
-    print(sr_output)
-    if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :]
-    print(9)
-    print((sr_out, wav_out.squeeze().numpy()))
-    return (sr_out, wav_out.squeeze().numpy())
-# Initialize
-patch_cuda()
-with gr.Blocks() as main:
-    text = gr.Textbox(label="text", value="hello, world!")
-    language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
-    speaker_audio = gr.Audio(label="voice reference", type="filepath")
-    clarity_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="clarity")
-    steps_slider = gr.Slider(1, 3000, 316, 1, label="steps")
-    dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
-    fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
-    pitch_std_slider = gr.Slider(0.0, 1000.0, 30.0, 1, label="pitch std")
-    speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")
-    cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
-    min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.15, label="min p")
-    with gr.Row():
-        e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
-        e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
-        e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
-        e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
-        e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
-        e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
-        e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
-        e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")
-    seed_number = gr.Number(label="seed", value=42, precision=0)
-    randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
-    generate_button = gr.Button("generate")
-    output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
-    generate_button.click(fn=generate, inputs=[text, language, speaker_audio, e1, e2, e3, e4, e5, e6, e7, e8, clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider, dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider, seed_number, randomize_seed_toggle], outputs=output_audio)
-main.launch()

 import os
+import shlex
+import subprocess
+subprocess.run(
+    shlex.split("pip install flash-attn --no-build-isolation"),
+    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    check=True,
+)
+subprocess.run(
+    shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
+    check=True,
+)
+subprocess.run(
+    shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
+    check=True,
+)
 def patch_cuda():
     if torch.cuda.is_available():
         for i in range(torch.cuda.device_count()):
             if not hasattr(p, "max_threads_per_multi_processor"):
                 setattr(p, "max_threads_per_multi_processor", 2048)
+patch_cuda()
+import spaces
+import torch
+import torchaudio
+import gradio as gr
+from os import getenv
+from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict, supported_language_codes
+# Device string passed to model loading and to the tensors created in generate_audio().
+device = "cuda"
+# Checkpoint names offered in the model dropdown.
+MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
+# Every checkpoint is loaded eagerly at import time and keyed by its name,
+# so handlers only need a dictionary lookup.
+MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
+# Inference only: turn off gradient tracking and switch each model to eval mode.
+for model in MODELS.values():
+    model.requires_grad_(False).eval()
+def update_ui(model_choice):
+    """
+    Dynamically show/hide UI elements based on the model's conditioners.
+    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
+    """
+    model = MODELS[model_choice]
+    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
+    print("Conditioners in this model:", cond_names)
+    text_update = gr.update(visible=("espeak" in cond_names))
+    language_update = gr.update(visible=("espeak" in cond_names))
+    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
+    prefix_audio_update = gr.update(visible=True)
+    emotion1_update = gr.update(visible=("emotion" in cond_names))
+    emotion2_update = gr.update(visible=("emotion" in cond_names))
+    emotion3_update = gr.update(visible=("emotion" in cond_names))
+    emotion4_update = gr.update(visible=("emotion" in cond_names))
+    emotion5_update = gr.update(visible=("emotion" in cond_names))
+    emotion6_update = gr.update(visible=("emotion" in cond_names))
+    emotion7_update = gr.update(visible=("emotion" in cond_names))
+    emotion8_update = gr.update(visible=("emotion" in cond_names))
+    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
+    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
+    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
+    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
+    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
+    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
+    unconditional_keys_update = gr.update(
+        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
+    )
+    return (
+        text_update,
+        language_update,
+        speaker_audio_update,
+        prefix_audio_update,
+        emotion1_update,
+        emotion2_update,
+        emotion3_update,
+        emotion4_update,
+        emotion5_update,
+        emotion6_update,
+        emotion7_update,
+        emotion8_update,
+        vq_single_slider_update,
+        fmax_slider_update,
+        pitch_std_slider_update,
+        speaking_rate_slider_update,
+        dnsmos_slider_update,
+        speaker_noised_checkbox_update,
+        unconditional_keys_update,
+    )
+@spaces.GPU(duration=120)
+def generate_audio(
+    model_choice,
+    text,
+    language,
+    speaker_audio,
+    prefix_audio,
+    e1,
+    e2,
+    e3,
+    e4,
+    e5,
+    e6,
+    e7,
+    e8,
+    vq_single,
+    fmax,
+    pitch_std,
+    speaking_rate,
+    dnsmos_ovrl,
+    speaker_noised,
+    cfg_scale,
+    min_p,
+    seed,
+    randomize_seed,
+    unconditional_keys,
+    progress=gr.Progress(),
+):
+    """
+    Generates audio based on the provided UI parameters.
+    We do NOT use language_id or ctc_loss even if the model has them.
+    Args:
+        model_choice: key into MODELS selecting which loaded model to run.
+        text, language: forwarded to make_cond_dict as text and language.
+        speaker_audio: optional file path; turned into a speaker embedding
+            unless "speaker" is listed in unconditional_keys.
+        prefix_audio: optional file path; encoded and handed to generate()
+            as audio_prefix_codes.
+        e1..e8: emotion values, packed in this order into a single tensor.
+        vq_single: one value repeated 8 times to form vqscore_8.
+        fmax, pitch_std, speaking_rate, dnsmos_ovrl, speaker_noised:
+            conditioning values forwarded to make_cond_dict.
+        cfg_scale, min_p: forwarded to generate() (min_p via sampling_params).
+        seed, randomize_seed: seed for torch.manual_seed; when randomize_seed
+            is truthy a fresh seed is drawn first and replaces `seed`.
+        unconditional_keys: forwarded to make_cond_dict as unconditional_keys.
+        progress: Gradio progress tracker updated from the generation callback.
+    Returns:
+        ((sample_rate, waveform as a NumPy array), seed), where seed is the
+        value that was actually passed to torch.manual_seed.
+    """
+    selected_model = MODELS[model_choice]
+    # Normalise the UI values to plain Python types before use.
+    speaker_noised_bool = bool(speaker_noised)
+    fmax = float(fmax)
+    pitch_std = float(pitch_std)
+    speaking_rate = float(speaking_rate)
+    dnsmos_ovrl = float(dnsmos_ovrl)
+    cfg_scale = float(cfg_scale)
+    min_p = float(min_p)
+    seed = int(seed)
+    # Hard cap on generated tokens; presumably ~86 codec frames per second for
+    # at most 30 seconds — TODO confirm against the Zonos autoencoder.
+    max_new_tokens = 86 * 30
+    if randomize_seed:
+        # The fresh seed is returned at the end so the caller can display it.
+        seed = torch.randint(0, 2**32 - 1, (1,)).item()
     torch.manual_seed(seed)
     speaker_embedding = None
+    # Only build an embedding when a reference clip exists and the speaker
+    # conditioning has not been marked unconditional.
+    if speaker_audio is not None and "speaker" not in unconditional_keys:
         wav, sr = torchaudio.load(speaker_audio)
+        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
+        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
+    audio_prefix_codes = None
+    if prefix_audio is not None:
+        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
+        # Average over dim 0 (the channel axis, assuming torchaudio's
+        # (channels, frames) layout) and resample to the autoencoder's rate.
+        wav_prefix = wav_prefix.mean(0, keepdim=True)
+        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
+        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
+        with torch.autocast(device, dtype=torch.float32):
+            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
+    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
+    vq_val = float(vq_single)
+    # The single slider value is repeated 8 times and given a leading dimension of 1.
+    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
     cond_dict = make_cond_dict(
+        text=text,
         language=language,
         speaker=speaker_embedding,
         emotion=emotion_tensor,
         vqscore_8=vq_tensor,
+        fmax=fmax,
+        pitch_std=pitch_std,
+        speaking_rate=speaking_rate,
+        dnsmos_ovrl=dnsmos_ovrl,
+        speaker_noised=speaker_noised_bool,
         device=device,
+        unconditional_keys=unconditional_keys,
     )
+    conditioning = selected_model.prepare_conditioning(cond_dict)
+    # Rough total for the progress bar only: text length scaled so that 400
+    # characters map to 30, then multiplied by the same 86 factor as the cap.
+    estimated_generation_duration = 30 * len(text) / 400
+    estimated_total_steps = int(estimated_generation_duration * 86)
+    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
+        progress((step, estimated_total_steps))
+        # Presumably tells generate() to keep going — confirm against Zonos.generate.
+        return True
+    codes = selected_model.generate(
         prefix_conditioning=conditioning,
+        audio_prefix_codes=audio_prefix_codes,
+        max_new_tokens=max_new_tokens,
+        cfg_scale=cfg_scale,
         batch_size=1,
+        sampling_params=dict(min_p=min_p),
+        callback=update_progress,
     )
+    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
+    sr_out = selected_model.autoencoder.sampling_rate
+    # Keep only the first row when the decoded waveform is 2-D with several rows.
+    if wav_out.dim() == 2 and wav_out.size(0) > 1:
+        wav_out = wav_out[0:1, :]
+    return (sr_out, wav_out.squeeze().numpy()), seed
+# Custom CSS for pastel gradient background and enhanced UI.
+# Passed to gr.Blocks(css=...) in build_interface(); the class selectors below
+# are attached there through elem_classes (or, for .title, inline HTML).
+# NOTE(review): .tabs / .tab are not referenced by any elem_classes in
+# build_interface() — presumably unused; confirm before removing.
+custom_css = """
+.gradio-container {
+    background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
+    background-size: 400% 400%;
+    animation: gradient 15s ease infinite;
+}
+@keyframes gradient {
+    0% {
+        background-position: 0% 50%;
+    }
+    50% {
+        background-position: 100% 50%;
+    }
+    100% {
+        background-position: 0% 50%;
+    }
+}
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+    padding: 20px;
+}
+.panel {
+    background-color: rgba(255, 255, 255, 0.7);
+    border-radius: 16px;
+    padding: 20px;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+    margin-bottom: 16px;
+    backdrop-filter: blur(5px);
+    transition: all 0.3s ease;
+}
+.panel:hover {
+    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
+    transform: translateY(-2px);
+}
+.title {
+    font-size: 1.2em;
+    font-weight: 600;
+    margin-bottom: 12px;
+    color: #6a3ea1;
+    border-bottom: 2px solid #f0e6ff;
+    padding-bottom: 8px;
+}
+.slider-container {
+    background-color: rgba(255, 255, 255, 0.5);
+    border-radius: 10px;
+    padding: 10px;
+    margin: 5px 0;
+}
+/* Make sliders more appealing */
+input[type=range] {
+    height: 5px;
+    appearance: none;
+    width: 100%;
+    border-radius: 3px;
+    background: linear-gradient(90deg, #9c83e0, #83b1e0);
+}
+.generate-button {
+    background: linear-gradient(90deg, #a673ff, #7c4dff);
+    color: white;
+    border: none;
+    border-radius: 8px;
+    padding: 12px 24px;
+    font-size: 16px;
+    font-weight: 500;
+    cursor: pointer;
+    transition: all 0.3s ease;
+    box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
+    display: block;
+    width: 100%;
+    margin: 20px 0;
+}
+.generate-button:hover {
+    background: linear-gradient(90deg, #9c5eff, #6a3aff);
+    box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
+    transform: translateY(-2px);
+}
+/* Tabs styling */
+.tabs {
+    display: flex;
+    border-bottom: 1px solid #e0e0e0;
+    margin-bottom: 20px;
+}
+.tab {
+    padding: 10px 20px;
+    cursor: pointer;
+    transition: all 0.3s ease;
+    background-color: transparent;
+    border: none;
+    color: #666;
+}
+.tab.active {
+    color: #7c4dff;
+    border-bottom: 3px solid #7c4dff;
+    font-weight: 600;
+}
+/* Emotion sliders container */
+.emotion-grid {
+    display: grid;
+    grid-template-columns: repeat(4, 1fr);
+    gap: 12px;
+}
+/* Header styling */
+.app-header {
+    text-align: center;
+    margin-bottom: 25px;
+}
+.app-header h1 {
+    font-size: 2.5em;
+    color: #6a3ea1;
+    margin-bottom: 8px;
+    font-weight: 700;
+}
+.app-header p {
+    font-size: 1.1em;
+    color: #666;
+    margin-bottom: 20px;
+}
+/* Audio player styling */
+.audio-output {
+    margin-top: 20px;
+}
+/* Make output area more prominent */
+.output-container {
+    background-color: rgba(255, 255, 255, 0.85);
+    border-radius: 16px;
+    padding: 24px;
+    box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
+    margin-top: 20px;
+}
+"""
+def build_interface():
+    # Build interface with enhanced visual elements and layout
+    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+        # Header section
+        with gr.Column(elem_classes="app-header"):
+            gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
+            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
+        # Main content container
+        with gr.Column(elem_classes="container"):
+            # First panel - Text & Model Selection
+            with gr.Column(elem_classes="panel"):
+                gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        model_choice = gr.Dropdown(
+                            choices=MODEL_NAMES,
+                            value="Zyphra/Zonos-v0.1-transformer",
+                            label="Zonos Model Type",
+                            info="Select the model variant to use.",
+                        )
+                        text = gr.Textbox(
+                            label="Text to Synthesize",
+                            value="Zonos uses eSpeak for text to phoneme conversion!",
+                            lines=4,
+                            max_length=500,
+                        )
+                        language = gr.Dropdown(
+                            choices=supported_language_codes,
+                            value="en-us",
+                            label="Language Code",
+                            info="Select a language code.",
+                        )
+                    with gr.Column(scale=1):
+                        prefix_audio = gr.Audio(
+                            value="assets/silence_100ms.wav",
+                            label="Optional Prefix Audio (continue from this audio)",
+                            type="filepath",
+                        )
+            # Second panel - Voice Characteristics
+            with gr.Column(elem_classes="panel"):
+                gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        speaker_audio = gr.Audio(
+                            label="Optional Speaker Audio (for voice cloning)",
+                            type="filepath",
+                        )
+                        speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
+                    with gr.Column(scale=2):
+                        with gr.Row():
+                            with gr.Column():
+                                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
+                                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
+                                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
+                            with gr.Column():
+                                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
+                                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
+            # Third panel - Generation Parameters
+            with gr.Column(elem_classes="panel"):
+                gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
+                with gr.Row():
+                    with gr.Column():
+                        cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
+                        min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
+                    with gr.Column():
+                        seed_number = gr.Number(label="Seed", value=420, precision=0)
+                        randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
+            # Emotion Panel with Tabbed Interface
+            with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
+                gr.Markdown(
+                    "Adjust these sliders to control the emotional tone of the generated speech.\n"
+                    "For a neutral voice, keep 'Neutral' high and other emotions low."
+                )
+                with gr.Row(elem_classes="emotion-grid"):
+                    emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
+                    emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
+                    emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
+                    emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
+                with gr.Row(elem_classes="emotion-grid"):
+                    emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
+                    emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
+                    emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
+                    emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
+            # Advanced Settings Panel
+            with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
+                gr.Markdown(
+                    "### Unconditional Toggles\n"
+                    "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
+                    'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
+                )
+                unconditional_keys = gr.CheckboxGroup(
+                    [
+                        "speaker",
+                        "emotion",
+                        "vqscore_8",
+                        "fmax",
+                        "pitch_std",
+                        "speaking_rate",
+                        "dnsmos_ovrl",
+                        "speaker_noised",
+                    ],
+                    value=["emotion"],
+                    label="Unconditional Keys",
+                )
+            # Generate Button and Output Area
+            with gr.Column(elem_classes="panel output-container"):
+                gr.Markdown('<div class="title">🔊 Generate & Output</div>')
+                generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
+                output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
+        model_choice.change(
+            fn=update_ui,
+            inputs=[model_choice],
+            outputs=[
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                unconditional_keys,
+            ],
+        )
+        # On page load, trigger the same UI refresh
+        demo.load(
+            fn=update_ui,
+            inputs=[model_choice],
+            outputs=[
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                unconditional_keys,
+            ],
+        )
+        # Generate audio on button click
+        generate_button.click(
+            fn=generate_audio,
+            inputs=[
+                model_choice,
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                cfg_scale_slider,
+                min_p_slider,
+                seed_number,
+                randomize_seed_toggle,
+                unconditional_keys,
+            ],
+            outputs=[output_audio, seed_number],
+        )
+    return demo
+# Script entry point: build the UI and start the Gradio server.
+if __name__ == "__main__":
+    demo = build_interface()
+    # A public share link is opt-in via GRADIO_SHARE ("true", "1" or "t", case-insensitive).
+    share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
+    # Listen on all interfaces on port 7860; mcp_server=True is forwarded to launch() as-is.
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)
+# # Imports
+# import gradio as gr
+# import spaces
+# import os
+# import torch
+# import torchaudio
+# import time
+# from zonos.model import Zonos
+# from zonos.conditioning import make_cond_dict, supported_language_codes
+# # Variables
+# HF_TOKEN = os.environ.get("HF_TOKEN", "")
+# device = "cuda"
+# REPO = "Zyphra/Zonos-v0.1-transformer"
+# model = Zonos.from_pretrained(REPO, device=device)
+# # Functions
+# def patch_cuda():
+#     if torch.cuda.is_available():
+#         for i in range(torch.cuda.device_count()):
+#             p = torch.cuda.get_device_properties(i)
+#             if not hasattr(p, "regs_per_multiprocessor"):
+#                 setattr(p, "regs_per_multiprocessor", 65536)
+#             if not hasattr(p, "max_threads_per_multi_processor"):
+#                 setattr(p, "max_threads_per_multi_processor", 2048)
+# @spaces.GPU
+# def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral, clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p, steps, seed, randomize_seed):
+#     if randomize_seed: seed = int(time.time())
+#     torch.manual_seed(seed)
+#     speaker_embedding = None
+#     if speaker_audio is not None:
+#         print(1)
+#         print(speaker_audio)
+#         wav, sr = torchaudio.load(speaker_audio)
+#         print(2)
+#         print(wav)
+#         print(sr)
+#         speaker_embedding = (model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16))
+#         print(3)
+#         print(speaker_embedding)
+#     emotion_tensor = torch.tensor([emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral], device=device, dtype=torch.bfloat16)
+#     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)
+#     print(4)
+#     print(emotion_tensor)
+#     print(vq_tensor)
+#     cond_dict = make_cond_dict(
+#         text=input,
+#         language=language,
+#         speaker=speaker_embedding,
+#         emotion=emotion_tensor,
+#         vqscore_8=vq_tensor,
+#         fmax=float(fmax),
+#         pitch_std=float(pitch_std),
+#         speaking_rate=float(speaking_rate),
+#         dnsmos_ovrl=float(dnsmos_ovrl),
+#         device=device,
+#     )
+#     print(5)
+#     print(cond_dict)
+#     conditioning = model.prepare_conditioning(cond_dict)
+#     print(6)
+#     print(conditioning)
+#     codes = model.generate(
+#         prefix_conditioning=conditioning,
+#         max_new_tokens=int(steps),
+#         cfg_scale=float(cfg_scale),
+#         batch_size=1,
+#         sampling_params=dict(min_p=float(min_p)),
+#     )
+#     print(7)
+#     print(codes)
+#     wav_out = model.autoencoder.decode(codes).cpu().detach()
+#     sr_out = model.autoencoder.sampling_rate
+#     print(8)
+#     print(wav_out)
+#     print(sr_out)
+#     if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :]
+#     print(9)
+#     print((sr_out, wav_out.squeeze().numpy()))
+#     return (sr_out, wav_out.squeeze().numpy())
+# # Initialize
+# patch_cuda()
+# with gr.Blocks() as main:
+#     text = gr.Textbox(label="text", value="hello, world!")
+#     language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
+#     speaker_audio = gr.Audio(label="voice reference", type="filepath")
+#     clarity_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="clarity")
+#     steps_slider = gr.Slider(1, 3000, 316, 1, label="steps")
+#     dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
+#     fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
+#     pitch_std_slider = gr.Slider(0.0, 1000.0, 30.0, 1, label="pitch std")
+#     speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")
+#     cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
+#     min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.15, label="min p")
+#     with gr.Row():
+#         e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
+#         e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
+#         e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
+#         e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
+#         e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
+#         e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
+#         e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
+#         e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")
+#     seed_number = gr.Number(label="seed", value=42, precision=0)
+#     randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
+#     generate_button = gr.Button("generate")
+#     output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
+#     generate_button.click(fn=generate, inputs=[text, language, speaker_audio, e1, e2, e3, e4, e5, e6, e7, e8, clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider, dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider, seed_number, randomize_seed_toggle], outputs=output_audio)
+# main.launch()