import os

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import pipeline, AutoTokenizer
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from huggingface_hub import snapshot_download
from dia.model import DiaConfig, DiaModel, Dia
from dac.utils import load_model as load_dac_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

HF_TOKEN = os.environ["HF_TOKEN"]
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = "auto"
DAC_SR = 44100  # the 44kHz DAC codec and Dia both operate at 44.1 kHz

# RVQ codec (Descript Audio Codec, 44.1 kHz variant)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)

# Voice-activity-detection pipeline (gated model; needs a valid HF token)
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection", use_auth_token=HF_TOKEN
)

# Ultravox speech-language model
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)

# Audio diffusion for instrumental texture
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
).to(device)

# Dia TTS: instantiate on the meta device, then dispatch the real weights.
# load_checkpoint_and_dispatch needs a local path, so snapshot the repo first.
config = DiaConfig.from_pretrained("nari-labs/Dia-1.6B")
with init_empty_weights():
    base_model = DiaModel(config)
ckpt_dir = snapshot_download("nari-labs/Dia-1.6B")
base_model = load_checkpoint_and_dispatch(
    base_model, ckpt_dir, device_map=device_map, dtype=torch.float16
)
dia = Dia(base_model, config)

# Tokenizer for Dia text prompts (assumes the repo ships tokenizer files)
tokenizer = AutoTokenizer.from_pretrained("nari-labs/Dia-1.6B")


def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio delivers int16 PCM; downmix stereo and convert to float in [-1, 1].
    if array.ndim == 2:
        array = array.mean(axis=1)
    if array.dtype != np.float32:
        array = array.astype(np.float32) / 32768.0

    wav = torch.from_numpy(array).unsqueeze(0)  # (channel, time)
    # VAD runs over the utterance; its output is not used further yet.
    vad_pipe({"waveform": wav, "sample_rate": sr})

    # Round-trip through the RVQ codec. DAC expects 44.1 kHz input in a
    # (batch, channel, time) layout; encode() returns (z, codes, latents, ...)
    # and decode() consumes the continuous latents z, not the codes.
    x = torchaudio.functional.resample(wav, sr, DAC_SR).unsqueeze(0).to(device)
    with torch.no_grad():
        z, codes, *_ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Transcribe/respond with Ultravox. The custom pipeline takes "audio",
    # "turns", and "sampling_rate" keys; no prior conversation turns here.
    ultra_out = ultravox_pipe(
        {"audio": decoded, "turns": [], "sampling_rate": DAC_SR}
    )
    text = ultra_out if isinstance(ultra_out, str) else ultra_out.get("text", "")

    # Instrumental bed seeded from the decoded speech (currently unused).
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # Synthesize the reply with Dia. This assumes the Dia wrapper accepts
    # HF-style tokenized inputs; the upstream API also takes raw text.
    inputs = tokenizer(f"[emotion:neutral] {text}", return_tensors="pt").to(device)
    tts = dia.generate(**inputs)
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts)

    # Peak-normalize to 0.95 full scale, guarding against silent output.
    peak = float(np.max(np.abs(tts_np))) if tts_np.size else 0.0
    if peak > 0:
        tts_np = tts_np * (0.95 / peak)
    # Dia generates 44.1 kHz audio, so report that rate, not the input's.
    return (DAC_SR, tts_np), text


with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4.x takes a list of sources rather than a single `source` string.
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()