"""Gradio front-end for asking questions about audio with the UltraVox model."""

import logging
import os
import tempfile
from functools import lru_cache

import gradio as gr
import librosa
import numpy as np
from transformers import pipeline

logger = logging.getLogger(__name__)

MODEL_ID = 'fixie-ai/ultravox-v0_5-llama-3_2-1b'

# Sampling rate used for every waveform handed to the model; file inputs are
# loaded at this rate and in-memory inputs are resampled to it.
TARGET_SAMPLE_RATE = 16000

SYSTEM_PROMPT = (
    "You are a friendly and helpful AI assistant. "
    "Respond conversationally to the user's audio input."
)
DEFAULT_PROMPT = "Describe what you heard in the audio."


# Cache the model to avoid reloading on every interaction
@lru_cache(maxsize=1)
def load_model():
    """Build the UltraVox pipeline once and reuse it for every request."""
    return pipeline(
        model=MODEL_ID,
        trust_remote_code=True,
        device_map="auto",  # Automatically uses GPU if available
    )


def _load_waveform(audio_file):
    """Return ``(waveform, sampling_rate)`` as mono float32 at TARGET_SAMPLE_RATE.

    Accepts a filesystem path (``str`` or ``os.PathLike``), a file-like object
    exposing ``.name`` (e.g. a named temporary file), or a
    ``(sampling_rate, samples)`` tuple.

    Raises:
        ValueError: if no audio was supplied or the sample array is empty.
    """
    if audio_file is None:
        raise ValueError("no audio was provided")

    # Paths first: a pathlib.Path also has ``.name``, but that is only the
    # basename, so it must not be treated as a file-like object.
    if isinstance(audio_file, (str, os.PathLike)):
        return librosa.load(os.fspath(audio_file), sr=TARGET_SAMPLE_RATE)
    if hasattr(audio_file, "name"):
        return librosa.load(audio_file.name, sr=TARGET_SAMPLE_RATE)

    # Otherwise expect a (sampling_rate, samples) pair.
    sr, samples = audio_file
    samples = np.asarray(samples)
    if samples.size == 0:
        raise ValueError("audio is empty")

    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM into [-1.0, 1.0] so it matches what librosa.load
        # produces for file inputs.
        info = np.iinfo(samples.dtype)
        full_scale = float(max(abs(info.min), info.max))
        samples = samples.astype(np.float32) / np.float32(full_scale)
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Down-mix to mono; assumes a (samples, channels) layout as produced
        # by gr.Audio(type="numpy") - TODO confirm for other callers.
        samples = samples.mean(axis=1)

    if sr != TARGET_SAMPLE_RATE:
        samples = librosa.resample(
            samples, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE
        )
        sr = TARGET_SAMPLE_RATE

    return samples, sr


def _extract_reply(result):
    """Pull the assistant's text out of the pipeline result.

    Handles both a plain string and a list of chat-message dicts, so the
    caller does not depend on which of the two shapes the pipeline returns.
    """
    if isinstance(result, str):
        return result
    return result[-1]["content"]


def process_audio(audio_file, user_message):
    """Run the model on an audio clip and return its text response.

    Args:
        audio_file: Path to an audio file, a file-like object with ``.name``,
            or a ``(sampling_rate, samples)`` tuple. ``None`` is reported as
            an error message rather than raising.
        user_message: Optional question about the audio. Empty or
            whitespace-only input falls back to a generic "describe the
            audio" prompt.

    Returns:
        The model's reply, or a string starting with
        ``"Error processing audio: "`` if anything went wrong. Exceptions are
        never propagated, so the UI always has text to display.
    """
    try:
        audio, sr = _load_waveform(audio_file)

        question = (user_message or "").strip() or DEFAULT_PROMPT

        # Initialize conversation
        # NOTE(review): confirm the pipeline still attaches the audio when the
        # final turn is already a user message without an audio placeholder.
        turns = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question},
        ]

        # Get model prediction
        pipe = load_model()
        result = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': sr},
            max_new_tokens=100,
        )
        return _extract_reply(result)
    except Exception as e:
        # UI boundary: keep the traceback in the logs, show a short message.
        logger.exception("Audio processing failed")
        return f"Error processing audio: {e}"


# Gradio UI
with gr.Blocks(title="UltraVox Audio Assistant") as demo:
    gr.Markdown("## 🎤 UltraVox Audio Assistant")
    gr.Markdown(
        "Upload an audio file or speak via microphone, then ask questions about it."
    )

    with gr.Row():
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Input Audio",
        )
        text_input = gr.Textbox(
            label="Your Question (Optional)",
            placeholder="Ask me about the audio...",
        )

    submit_btn = gr.Button("Process")
    output = gr.Textbox(label="AI Response", interactive=False)

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, text_input],
        outputs=output,
    )

    gr.Examples(
        examples=[
            ["examples/weather_report.wav", "What's the weather forecast?"],
            ["examples/meeting_notes.mp3", "Summarize the key points"],
        ],
        inputs=[audio_input, text_input],
    )

if __name__ == "__main__":
    demo.launch()