"""Gradio front end for the Dia text-to-speech model.

Importing this module loads the Dia model once and builds the ``demo``
Blocks interface; running it as a script launches the web server.
"""

import gradio as gr
import torch
import numpy as np
import tempfile
import os
import logging
from typing import Optional, Tuple

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global model variable; stays None until load_dia_model() succeeds.
model = None


def load_dia_model() -> bool:
    """Load the Dia model into the module-level ``model`` variable.

    CUDA with ``float16`` is selected when ``torch.cuda.is_available()``
    is true; otherwise the model is loaded on CPU with ``float32``.

    Returns:
        True when the model was loaded, False when any exception occurred
        (the exception is logged with its traceback and not re-raised, so
        the UI can still start and show a warning).
    """
    global model
    try:
        logger.info("Loading Dia model...")
        # Imported lazily so that a missing `dia` package is reported as a
        # load failure instead of crashing the module import.
        from dia import Dia

        # Load with appropriate device and dtype
        cuda_available = torch.cuda.is_available()
        device = torch.device("cuda" if cuda_available else "cpu")
        compute_dtype = "float16" if cuda_available else "float32"

        model = Dia.from_pretrained(
            "nari-labs/Dia-1.6B-0626",
            device=device,
            compute_dtype=compute_dtype,
        )

        logger.info("Dia model loaded successfully on %s", device)
        return True
    except Exception as e:
        # Top-level boundary: keep the app alive, but keep the traceback.
        logger.exception("Failed to load Dia model: %s", e)
        return False


def _remove_quietly(path: str) -> None:
    """Best-effort removal of a leftover temporary file."""
    try:
        os.remove(path)
    except OSError as cleanup_error:
        logger.warning("Could not remove temporary file %s: %s", path, cleanup_error)


def generate_speech(
    text: str,
    max_tokens: int = 3072,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> Tuple[Optional[str], str]:
    """Generate speech from text using the loaded Dia model.

    Args:
        text: Text to synthesize; empty or whitespace-only input is rejected.
        max_tokens: Forwarded to ``model.generate`` after coercion to ``int``
            (UI sliders may deliver the value as a float).
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.

    Returns:
        ``(path, status)`` where ``path`` is the generated ``.wav`` file, or
        ``None`` on failure, and ``status`` is a human-readable message.
        Exceptions from generation or saving are caught and reported through
        ``status`` rather than raised.
    """
    if not text or not text.strip():
        return None, "❌ Please enter some text to convert to speech"

    if model is None:
        return None, "❌ Model not loaded. Please refresh the page and try again."

    output_path = None
    try:
        logger.info("Generating speech for text: %s...", text[:50])

        # Generate audio using Dia model
        audio_array = model.generate(
            text=text.strip(),
            max_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
        )

        # Reserve a unique path and close the handle *before* writing:
        # some platforms refuse to reopen a temp file that is still open.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            output_path = temp_file.name
        model.save_audio(output_path, audio_array)

        logger.info("Speech generation completed successfully")
        suffix = "..." if len(text) > 50 else ""
        return output_path, f"✅ Generated speech for: '{text[:50]}{suffix}'"

    except Exception as e:
        error_msg = f"❌ Error generating speech: {str(e)}"
        logger.exception(error_msg)
        # delete=False means nobody else cleans up a half-written file.
        if output_path is not None:
            _remove_quietly(output_path)
        return None, error_msg


# Load model on startup
model_loaded = load_dia_model()

# Create Gradio interface
with gr.Blocks(
    title="Dia TTS - Nari Voice Generator",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 800px !important;
        margin: auto !important;
    }
    """
) as demo:

    gr.Markdown("""
    # 🎙️ Dia TTS - Nari Voice Generator

    Convert your text into natural, human-like speech using the advanced Dia text-to-speech model.

    **Model**: `nari-labs/Dia-1.6B-0626`
    """)

    if not model_loaded:
        gr.Markdown("⚠️ **Warning**: Model failed to load. Some functionality may not work.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="📝 Text Input",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )

            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=512,
                    maximum=4096,
                    value=3072,
                    step=128,
                    label="🎯 Max Tokens"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="🌡️ Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.1,
                    label="🎲 Top P"
                )

            generate_btn = gr.Button(
                "🎵 Generate Speech",
                variant="primary",
                size="lg"
            )

        with gr.Column():
            audio_output = gr.Audio(
                label="🔊 Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="📊 Status",
                interactive=False,
                lines=2
            )

    # Event handlers
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, max_tokens, temperature, top_p],
        outputs=[audio_output, status_output],
        show_progress=True
    )

    # Examples
    gr.Examples(
        examples=[
            ["Transform your text into natural, human-like speech with our advanced AI technology.", 3072, 0.7, 0.9],
            ["The quick brown fox jumps over the lazy dog. This is a test of the Dia text-to-speech system.", 2048, 0.8, 0.9],
            ["Welcome to the future of voice synthesis. Experience the power of AI-generated speech.", 3072, 0.6, 0.8],
        ],
        inputs=[text_input, max_tokens, temperature, top_p],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        cache_examples=False
    )

    gr.Markdown("""
    ---
    ### 📚 Usage Tips:
    - **Max Tokens**: Controls the length of generated audio (higher = longer)
    - **Temperature**: Controls randomness (0.1 = conservative, 1.0 = creative)
    - **Top P**: Controls diversity of word selection (0.1 = focused, 1.0 = diverse)

    ### ⚙️ Technical Details:
    - Model: Dia-1.6B-0626 by Nari Labs
    - Output Format: WAV audio
    - Recommended Text Length: 50-500 characters for best results
    """)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False
    )