Spaces:

Staticaliza
/

Zero-5

Running on Zero

App Files Community

Staticaliza committed 13 days ago

Commit

39d0c1a

verified ·

1 Parent(s): dec0f9c

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -325

app.py CHANGED Viewed

@@ -1,44 +1,18 @@
-import os
-import shlex
-import subprocess
-subprocess.run(
-    shlex.split("pip install flash-attn --no-build-isolation"),
-    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    check=True,
-)
-subprocess.run(
-    shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
-    check=True,
-)
-subprocess.run(
-    shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
-    check=True,
-)
-import spaces
-import torch
-import torchaudio
-import gradio as gr
-from os import getenv
 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes
-# 1. hard-kill torch.compile / dynamo / inductor so they never run
 os.environ["TORCH_COMPILE_DISABLE"] = "1"
 os.environ["TORCHINDUCTOR_DISABLE"] = "1"
-os.environ["TORCHDYNAMO_DISABLE"] = "1"          # <- the one that actually blocks torch._dynamo
-os.environ["TORCHDYNAMO_SUPPRESS_ERRORS"] = "True"  # fall back to eager if something still slips through  :contentReference[oaicite:1]{index=1}
-torch._dynamo.disable()        # guard for older versions
-torch.compile = lambda f,*_,**__: f   # no-op wrapper
 device = "cuda"
-MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
-MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
-for model in MODELS.values():
-    model.requires_grad_(False).eval()
 def _patch_cuda_props():
     if torch.cuda.is_available():
@@ -48,70 +22,15 @@ def _patch_cuda_props():
                 setattr(p, "regs_per_multiprocessor", 65536)
             if not hasattr(p, "max_threads_per_multi_processor"):
                 setattr(p, "max_threads_per_multi_processor", 2048)
-_patch_cuda_props()
-def update_ui(model_choice):
-    """
-    Dynamically show/hide UI elements based on the model's conditioners.
-    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
-    """
-    model = MODELS[model_choice]
-    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
-    print("Conditioners in this model:", cond_names)
-    text_update = gr.update(visible=("espeak" in cond_names))
-    language_update = gr.update(visible=("espeak" in cond_names))
-    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
-    prefix_audio_update = gr.update(visible=True)
-    emotion1_update = gr.update(visible=("emotion" in cond_names))
-    emotion2_update = gr.update(visible=("emotion" in cond_names))
-    emotion3_update = gr.update(visible=("emotion" in cond_names))
-    emotion4_update = gr.update(visible=("emotion" in cond_names))
-    emotion5_update = gr.update(visible=("emotion" in cond_names))
-    emotion6_update = gr.update(visible=("emotion" in cond_names))
-    emotion7_update = gr.update(visible=("emotion" in cond_names))
-    emotion8_update = gr.update(visible=("emotion" in cond_names))
-    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
-    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
-    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
-    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
-    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
-    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
-    unconditional_keys_update = gr.update(
-        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
-    )
-    return (
-        text_update,
-        language_update,
-        speaker_audio_update,
-        prefix_audio_update,
-        emotion1_update,
-        emotion2_update,
-        emotion3_update,
-        emotion4_update,
-        emotion5_update,
-        emotion6_update,
-        emotion7_update,
-        emotion8_update,
-        vq_single_slider_update,
-        fmax_slider_update,
-        pitch_std_slider_update,
-        speaking_rate_slider_update,
-        dnsmos_slider_update,
-        speaker_noised_checkbox_update,
-        unconditional_keys_update,
-    )
-@spaces.GPU(duration=120)
 def generate_audio(
-    model_choice,
     text,
     language,
     speaker_audio,
-    prefix_audio,
     e1,
     e2,
     e3,
@@ -120,58 +39,35 @@ def generate_audio(
     e6,
     e7,
     e8,
-    vq_single,
     fmax,
     pitch_std,
     speaking_rate,
     dnsmos_ovrl,
-    speaker_noised,
     cfg_scale,
     min_p,
     seed,
     randomize_seed,
-    unconditional_keys,
     progress=gr.Progress(),
 ):
-    """
-    Generates audio based on the provided UI parameters.
-    We do NOT use language_id or ctc_loss even if the model has them.
-    """
-    selected_model = MODELS[model_choice]
-    speaker_noised_bool = bool(speaker_noised)
-    fmax = float(fmax)
-    pitch_std = float(pitch_std)
-    speaking_rate = float(speaking_rate)
-    dnsmos_ovrl = float(dnsmos_ovrl)
-    cfg_scale = float(cfg_scale)
-    min_p = float(min_p)
-    seed = int(seed)
-    max_new_tokens = 86 * 30
     if randomize_seed:
         seed = torch.randint(0, 2**32 - 1, (1,)).item()
-    torch.manual_seed(seed)
     speaker_embedding = None
-    if speaker_audio is not None and "speaker" not in unconditional_keys:
         wav, sr = torchaudio.load(speaker_audio)
-        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
-        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
-    audio_prefix_codes = None
-    if prefix_audio is not None:
-        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
-        wav_prefix = wav_prefix.mean(0, keepdim=True)
-        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
-        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
-        with torch.autocast(device, dtype=torch.float32):
-            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
-    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
-    vq_val = float(vq_single)
-    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
     cond_dict = make_cond_dict(
         text=text,
@@ -179,244 +75,104 @@ def generate_audio(
         speaker=speaker_embedding,
         emotion=emotion_tensor,
         vqscore_8=vq_tensor,
-        fmax=fmax,
-        pitch_std=pitch_std,
-        speaking_rate=speaking_rate,
-        dnsmos_ovrl=dnsmos_ovrl,
-        speaker_noised=speaker_noised_bool,
         device=device,
-        unconditional_keys=unconditional_keys,
     )
-    conditioning = selected_model.prepare_conditioning(cond_dict)
-    estimated_generation_duration = 30 * len(text) / 400
-    estimated_total_steps = int(estimated_generation_duration * 86)
-    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
         progress((step, estimated_total_steps))
         return True
-    codes = selected_model.generate(
         prefix_conditioning=conditioning,
-        audio_prefix_codes=audio_prefix_codes,
-        max_new_tokens=max_new_tokens,
-        cfg_scale=cfg_scale,
         batch_size=1,
-        sampling_params=dict(min_p=min_p),
-        callback=update_progress,
     )
-    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
-    sr_out = selected_model.autoencoder.sampling_rate
     if wav_out.dim() == 2 and wav_out.size(0) > 1:
         wav_out = wav_out[0:1, :]
     return (sr_out, wav_out.squeeze().numpy()), seed
 def build_interface():
-    # Build interface with enhanced visual elements and layout
     with gr.Blocks() as demo:
-        # Header section
-        with gr.Column(elem_classes="app-header"):
-            gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
-            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
-        # Main content container
-        with gr.Column(elem_classes="container"):
-            # First panel - Text & Model Selection
-            with gr.Column(elem_classes="panel"):
-                gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        model_choice = gr.Dropdown(
-                            choices=MODEL_NAMES,
-                            value="Zyphra/Zonos-v0.1-transformer",
-                            label="Zonos Model Type",
-                            info="Select the model variant to use.",
-                        )
-                        text = gr.Textbox(
-                            label="Text to Synthesize",
-                            value="Zonos uses eSpeak for text to phoneme conversion!",
-                            lines=4,
-                            max_length=500,
-                        )
-                        language = gr.Dropdown(
-                            choices=supported_language_codes,
-                            value="en-us",
-                            label="Language Code",
-                            info="Select a language code.",
-                        )
-                    with gr.Column(scale=1):
-                        prefix_audio = gr.Audio(
-                            value="assets/silence_100ms.wav",
-                            label="Optional Prefix Audio (continue from this audio)",
-                            type="filepath",
-                        )
-            # Second panel - Voice Characteristics
-            with gr.Column(elem_classes="panel"):
-                gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        speaker_audio = gr.Audio(
-                            label="Optional Speaker Audio (for voice cloning)",
-                            type="filepath",
-                        )
-                        speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
-                    with gr.Column(scale=2):
-                        with gr.Row():
-                            with gr.Column():
-                                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
-                                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
-                                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
-                            with gr.Column():
-                                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
-                                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
-            # Third panel - Generation Parameters
-            with gr.Column(elem_classes="panel"):
-                gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
-                with gr.Row():
-                    with gr.Column():
-                        cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
-                        min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
-                    with gr.Column():
-                        seed_number = gr.Number(label="Seed", value=420, precision=0)
-                        randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
-            # Emotion Panel with Tabbed Interface
-            with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
-                gr.Markdown(
-                    "Adjust these sliders to control the emotional tone of the generated speech.\n"
-                    "For a neutral voice, keep 'Neutral' high and other emotions low."
-                )
-                with gr.Row(elem_classes="emotion-grid"):
-                    emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
-                    emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
-                    emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
-                    emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
-                with gr.Row(elem_classes="emotion-grid"):
-                    emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
-                    emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
-                    emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
-                    emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
-            # Advanced Settings Panel
-            with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
-                gr.Markdown(
-                    "### Unconditional Toggles\n"
-                    "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
-                    'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
-                )
-                unconditional_keys = gr.CheckboxGroup(
-                    [
-                        "speaker",
-                        "emotion",
-                        "vqscore_8",
-                        "fmax",
-                        "pitch_std",
-                        "speaking_rate",
-                        "dnsmos_ovrl",
-                        "speaker_noised",
-                    ],
-                    value=["emotion"],
-                    label="Unconditional Keys",
-                )
-            # Generate Button and Output Area
-            with gr.Column(elem_classes="panel output-container"):
-                gr.Markdown('<div class="title">🔊 Generate & Output</div>')
-                generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
-                output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
-        model_choice.change(
-            fn=update_ui,
-            inputs=[model_choice],
-            outputs=[
-                text,
-                language,
-                speaker_audio,
-                prefix_audio,
-                emotion1,
-                emotion2,
-                emotion3,
-                emotion4,
-                emotion5,
-                emotion6,
-                emotion7,
-                emotion8,
-                vq_single_slider,
-                fmax_slider,
-                pitch_std_slider,
-                speaking_rate_slider,
-                dnsmos_slider,
-                speaker_noised_checkbox,
-                unconditional_keys,
-            ],
         )
-        # On page load, trigger the same UI refresh
-        demo.load(
-            fn=update_ui,
-            inputs=[model_choice],
-            outputs=[
-                text,
-                language,
-                speaker_audio,
-                prefix_audio,
-                emotion1,
-                emotion2,
-                emotion3,
-                emotion4,
-                emotion5,
-                emotion6,
-                emotion7,
-                emotion8,
-                vq_single_slider,
-                fmax_slider,
-                pitch_std_slider,
-                speaking_rate_slider,
-                dnsmos_slider,
-                speaker_noised_checkbox,
-                unconditional_keys,
-            ],
-        )
-        # Generate audio on button click
         generate_button.click(
             fn=generate_audio,
             inputs=[
-                model_choice,
                 text,
                 language,
                 speaker_audio,
-                prefix_audio,
-                emotion1,
-                emotion2,
-                emotion3,
-                emotion4,
-                emotion5,
-                emotion6,
-                emotion7,
-                emotion8,
-                vq_single_slider,
                 fmax_slider,
                 pitch_std_slider,
                 speaking_rate_slider,
                 dnsmos_slider,
-                speaker_noised_checkbox,
                 cfg_scale_slider,
                 min_p_slider,
                 seed_number,
                 randomize_seed_toggle,
-                unconditional_keys,
             ],
             outputs=[output_audio, seed_number],
         )
     return demo
 if __name__ == "__main__":
-    demo = build_interface()
-    demo.launch()

+import os, torch, torchaudio, gradio as gr
 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes
 os.environ["TORCH_COMPILE_DISABLE"] = "1"
 os.environ["TORCHINDUCTOR_DISABLE"] = "1"
+os.environ["TORCHDYNAMO_DISABLE"] = "1"
+os.environ["TORCHDYNAMO_SUPPRESS_ERRORS"] = "True"
+torch._dynamo.disable()
+torch.compile = lambda f, *_, **__: f
 device = "cuda"
+MODEL_NAME = "Zyphra/Zonos-v0.1-transformer"
+MODEL = Zonos.from_pretrained(MODEL_NAME, device=device).requires_grad_(False).eval()
 def _patch_cuda_props():
     if torch.cuda.is_available():
                 setattr(p, "regs_per_multiprocessor", 65536)
             if not hasattr(p, "max_threads_per_multi_processor"):
                 setattr(p, "max_threads_per_multi_processor", 2048)
+_patch_cuda_props()
 def generate_audio(
     text,
     language,
     speaker_audio,
     e1,
     e2,
     e3,
     e6,
     e7,
     e8,
+    clarity,
     fmax,
     pitch_std,
     speaking_rate,
     dnsmos_ovrl,
     cfg_scale,
     min_p,
+    steps,
     seed,
     randomize_seed,
     progress=gr.Progress(),
 ):
     if randomize_seed:
         seed = torch.randint(0, 2**32 - 1, (1,)).item()
+    torch.manual_seed(int(seed))
     speaker_embedding = None
+    if speaker_audio is not None:
         wav, sr = torchaudio.load(speaker_audio)
+        speaker_embedding = (
+            MODEL.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)
+        )
+    emotion_tensor = torch.tensor(
+        [e1, e2, e3, e4, e5, e6, e7, e8], device=device, dtype=torch.float32
+    )
+    vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.float32).unsqueeze(
+        0
+    )
     cond_dict = make_cond_dict(
         text=text,
         speaker=speaker_embedding,
         emotion=emotion_tensor,
         vqscore_8=vq_tensor,
+        fmax=float(fmax),
+        pitch_std=float(pitch_std),
+        speaking_rate=float(speaking_rate),
+        dnsmos_ovrl=float(dnsmos_ovrl),
         device=device,
     )
+    conditioning = MODEL.prepare_conditioning(cond_dict)
+    estimated_total_steps = int(steps)
+    def cb(_, step, __):
         progress((step, estimated_total_steps))
         return True
+    codes = MODEL.generate(
         prefix_conditioning=conditioning,
+        max_new_tokens=int(steps),
+        cfg_scale=float(cfg_scale),
         batch_size=1,
+        sampling_params=dict(min_p=float(min_p)),
+        callback=cb,
     )
+    wav_out = MODEL.autoencoder.decode(codes).cpu().detach()
+    sr_out = MODEL.autoencoder.sampling_rate
     if wav_out.dim() == 2 and wav_out.size(0) > 1:
         wav_out = wav_out[0:1, :]
     return (sr_out, wav_out.squeeze().numpy()), seed
 def build_interface():
     with gr.Blocks() as demo:
+        gr.Markdown("# ✨ zonos tts generator ✨")
+        text = gr.Textbox(label="text", value="hello, world!", lines=4, max_length=500)
+        language = gr.Dropdown(
+            choices=supported_language_codes, value="en-us", label="language"
         )
+        speaker_audio = gr.Audio(label="voice reference", type="filepath")
+        clarity_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="clarity")
+        steps_slider = gr.Slider(1, 3000, 300, 1, label="steps")
+        dnsmos_slider = gr.Slider(1.0, 5.0, 4.0, 0.1, label="quality")
+        fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
+        pitch_std_slider = gr.Slider(0.0, 300.0, 45.0, 1, label="pitch std")
+        speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.5, label="rate")
+        cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="guidance")
+        min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="min p")
+        with gr.Row():
+            e1 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="happy")
+            e2 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="sad")
+            e3 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="disgust")
+            e4 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="fear")
+        with gr.Row():
+            e5 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="surprise")
+            e6 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="anger")
+            e7 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="other")
+            e8 = gr.Slider(0.0, 1.0, 0.0, 0.05, label="neutral")
+        seed_number = gr.Number(label="seed", value=420, precision=0)
+        randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
+        generate_button = gr.Button("generate")
+        output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
         generate_button.click(
             fn=generate_audio,
             inputs=[
                 text,
                 language,
                 speaker_audio,
+                e1,
+                e2,
+                e3,
+                e4,
+                e5,
+                e6,
+                e7,
+                e8,
+                clarity_slider,
                 fmax_slider,
                 pitch_std_slider,
                 speaking_rate_slider,
                 dnsmos_slider,
                 cfg_scale_slider,
                 min_p_slider,
+                steps_slider,
                 seed_number,
                 randomize_seed_toggle,
             ],
             outputs=[output_audio, seed_number],
         )
     return demo
 if __name__ == "__main__":
+    build_interface().launch()