"""Gradio launcher for LLaMA-Omni on Hugging Face Spaces.

Sets up the runtime environment (dependencies, HiFi-GAN vocoder, Whisper),
starts the LLaMA-Omni controller and model-worker background services, and
serves a Gradio chat UI with speech and text input tabs.
"""

import os
import sys
import subprocess
import threading
import time

import gradio as gr

# --- Environment configuration (Hugging Face Spaces aware) ------------------
HF_SPACES = os.environ.get("SPACE_ID") is not None
MODEL_PATH = os.environ.get("MODEL_PATH", "ICTNLP/Llama-3.1-8B-Omni")
DEVICE = "cuda" if os.environ.get("SYSTEM_CUDA_VISIBLE_DEVICES") else "cpu"

# Module-level references to the background service processes so the Popen
# objects are not garbage-collected (assigned in start_services()).
controller_proc = None
worker_proc = None


def run_background_process(cmd, name):
    """Launch *cmd* as a background shell process and return the Popen handle.

    stdout and stderr are merged into one line-buffered text stream so a
    reader (e.g. ``read_process_output``) can consume the combined output.
    """
    print(f"Starting {name}...")
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered: output can be streamed as it appears
        universal_newlines=True,
        shell=True,
    )
    return process


def read_process_output(process, output_box, name):
    """Stream a process's combined stdout/stderr into a Gradio textbox.

    NOTE(review): calling ``output_box.update(...)`` outside an event handler
    does not push updates to the browser in current Gradio releases (the
    per-component ``update`` method was deprecated and later removed); this
    helper is also never attached to a thread anywhere in this file. Kept for
    interface compatibility — confirm before relying on it.
    """
    full_output = f"### {name} Output:\n\n"
    for line in process.stdout:
        full_output += line
        output_box.update(value=full_output)
    # Process ended — append the exit status.
    return_code = process.wait()
    full_output += f"\n\nProcess exited with code {return_code}"
    output_box.update(value=full_output)


def setup_environment():
    """Install dependencies and download model assets; return a status string."""
    # Create necessary directories.
    os.makedirs("models/speech_encoder", exist_ok=True)
    os.makedirs("vocoder", exist_ok=True)

    output = "Setting up environment...\n"

    # Install dependencies only if not in an HF Space (pre-installed there).
    if not HF_SPACES:
        output += "Installing dependencies...\n"
        # Use argument lists instead of shell strings: with shell=True the '>'
        # in "openai-whisper>=20231117" was parsed as a shell redirection to a
        # file named "=20231117", silently dropping the version pin.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "openai-whisper>=20231117"]
        )
        subprocess.run([sys.executable, "-m", "pip", "install", "fairseq==0.12.2"])

    # Download the HiFi-GAN vocoder checkpoint and config if not present.
    if not os.path.exists("vocoder/g_00500000"):
        output += "Downloading vocoder...\n"
        base_url = (
            "https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/"
            "code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj"
        )
        subprocess.run(["wget", f"{base_url}/g_00500000", "-P", "vocoder/"])
        subprocess.run(["wget", f"{base_url}/config.json", "-P", "vocoder/"])

    # Initialize Whisper (the model weights are downloaded automatically).
    output += "Initializing Whisper model (this may take a while)...\n"
    try:
        import whisper

        whisper.load_model("tiny", download_root="models/speech_encoder/")
        output += "✅ Whisper model initialized successfully!\n"
    except Exception as e:
        # Best-effort: report the failure in the status text instead of
        # crashing the UI callback.
        output += f"❌ Error initializing Whisper model: {str(e)}\n"

    return output + "✅ Environment setup complete!"


def start_services():
    """Start the controller and model worker; return a status string.

    Stores the Popen handles in the module-level ``controller_proc`` /
    ``worker_proc`` so they are not garbage-collected.
    """
    global controller_proc, worker_proc

    output = "Starting LLaMA-Omni services...\n"

    # Start the controller.
    controller_cmd = (
        "python -m omni_speech.serve.controller --host 0.0.0.0 --port 10000"
    )
    controller_process = run_background_process(controller_cmd, "Controller")
    output += "✅ Controller started\n"

    # Give the controller time to come up before the worker registers with it.
    time.sleep(5)

    # Start the model worker.
    worker_cmd = (
        "python -m omni_speech.serve.model_worker --host 0.0.0.0 "
        "--controller http://localhost:10000 --port 40000 "
        f"--worker http://localhost:40000 --model-path {MODEL_PATH} "
        "--model-name Llama-3.1-8B-Omni --s2s"
    )
    model_worker_process = run_background_process(worker_cmd, "Model Worker")
    output += f"✅ Model worker started with model: {MODEL_PATH}\n"

    # Wait for the model worker to load the model.
    time.sleep(10)

    # The web server is the Gradio UI itself, launched separately.
    output += "✅ All services started successfully!\n"

    # Keep references to the processes to prevent garbage collection.
    controller_proc = controller_process
    worker_proc = model_worker_process

    return output


def create_chat_ui(setup_status="Not started", services_status="Not started"):
    """Build and return the Gradio Blocks chat interface for LLaMA-Omni.

    Parameters
    ----------
    setup_status : str
        Initial text shown in the "Setup Status" box.
    services_status : str
        Initial text shown in the "Services Status" box.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🦙🎧 LLaMA-Omni: Seamless Speech Interaction")

        # Setup and status row: buttons on the left, status boxes on the right.
        with gr.Row():
            with gr.Column(scale=1):
                setup_btn = gr.Button("1️⃣ Setup Environment")
                services_btn = gr.Button(
                    "2️⃣ Start LLaMA-Omni Services", interactive=False
                )
            with gr.Column(scale=2):
                setup_output = gr.Textbox(
                    label="Setup Status", value=setup_status, lines=5
                )
                services_output = gr.Textbox(
                    label="Services Status", value=services_status, lines=5
                )

        # Input tabs: speech (microphone/upload) and plain text.
        with gr.Tabs():
            with gr.TabItem("Speech Input"):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or upload audio",
                )
                transcription_output = gr.Textbox(
                    label="Transcription", interactive=False
                )
                submit_audio_btn = gr.Button("Submit Audio", interactive=False)

            with gr.TabItem("Text Input"):
                text_input = gr.Textbox(
                    label="Text Input", placeholder="Type your message here..."
                )
                submit_text_btn = gr.Button("Submit Text", interactive=False)

        # Output area: conversation history and generated speech.
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(
                    label="Conversation", height=400, type="messages"
                )
                audio_output = gr.Audio(label="Generated Speech", interactive=False)

        def on_setup_click():
            """Run environment setup and enable the services button."""
            output_message = setup_environment()
            return {
                setup_output: gr.update(value=output_message),
                services_btn: gr.update(interactive=True),
            }

        def on_services_click():
            """Start backend services and enable the submit buttons."""
            output_message = start_services()
            return {
                services_output: gr.update(value=output_message),
                submit_audio_btn: gr.update(interactive=True),
                submit_text_btn: gr.update(interactive=True),
            }

        # Placeholder functions for API calls (to be implemented).
        def on_audio_input(audio):
            """Placeholder transcription preview for a recorded/uploaded clip."""
            if audio:
                # This would use Whisper to transcribe.
                return "Transcription will appear here when services are running."
            return ""

        def on_audio_submit(audio, chat_history):
            """Append a placeholder exchange for an audio submission."""
            if not audio:
                # Ensure chat_history is returned even if no audio.
                return chat_history if chat_history is not None else [], None

            # Placeholder for actual transcription logic.
            transcribed_text = "Audio input (transcription pending)"

            # Create new messages in the "messages" format.
            new_messages = [
                {"role": "user", "content": transcribed_text},
                {
                    "role": "assistant",
                    "content": (
                        "This is a placeholder response. The full model will be "
                        "running after starting the services."
                    ),
                },
            ]

            # Append to existing history (or initialize if history is None).
            updated_history = (
                chat_history if chat_history is not None else []
            ) + new_messages
            return updated_history, None

        def on_text_submit(text, chat_history):
            """Append a placeholder exchange for a text submission."""
            if not text:
                # Ensure chat_history is returned even if no text.
                return chat_history if chat_history is not None else [], None

            # Create new messages in the "messages" format.
            new_messages = [
                {"role": "user", "content": text},
                {
                    "role": "assistant",
                    "content": (
                        "This is a placeholder response. The full model will be "
                        "running after starting the services."
                    ),
                },
            ]

            # Append to existing history (or initialize if history is None).
            updated_history = (
                chat_history if chat_history is not None else []
            ) + new_messages
            return updated_history, None

        # Connect events.
        setup_btn.click(on_setup_click, outputs=[setup_output, services_btn])
        services_btn.click(
            on_services_click,
            outputs=[services_output, submit_audio_btn, submit_text_btn],
        )
        audio_input.change(on_audio_input, [audio_input], [transcription_output])
        submit_audio_btn.click(
            on_audio_submit, [audio_input, chatbot], [chatbot, audio_output]
        )
        submit_text_btn.click(
            on_text_submit, [text_input, chatbot], [chatbot, audio_output]
        )

        # Auto-run setup when the page loads on HF Spaces.
        if HF_SPACES:

            def perform_auto_setup_on_load():
                """Generator handler: show progress, then the final status."""
                # Update UI to show setup is starting.
                yield {setup_output: gr.update(value="Auto-starting setup process...")}

                # Actual setup call.
                final_setup_status_message = setup_environment()

                # Update UI with final status and enable the next button.
                yield {
                    setup_output: gr.update(value=final_setup_status_message),
                    services_btn: gr.update(interactive=True),
                }

            demo.load(perform_auto_setup_on_load, None, [setup_output, services_btn])

    return demo


if __name__ == "__main__":
    # Build the UI.
    demo = create_chat_ui()

    # Launch with appropriate parameters for HF Spaces.
    # favicon_path must be a *local* file path in Gradio; the remote URL that
    # was previously passed here is not supported, so it is omitted.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        share=False,
    )