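"""Gradio app for running LLaMA-Omni speech chat on Hugging Face Spaces.

Sets up the environment (Whisper speech encoder, HiFi-GAN vocoder),
starts the LLaMA-Omni controller and model worker as background
processes, and serves a chat UI with speech and text input.
"""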
import os
import sys
import subprocess
import threading
import time
import gradio as gr

# Configure environment for HF Spaces
HF_SPACES = os.environ.get("SPACE_ID") is not None
MODEL_PATH = os.environ.get("MODEL_PATH", "ICTNLP/Llama-3.1-8B-Omni")
DEVICE = "cuda" if os.environ.get("SYSTEM_CUDA_VISIBLE_DEVICES") else "cpu"

def run_background_process(cmd, name):
    """Run a background process and return the process object."""
    print(f"Starting {name}...")
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,  # universal_newlines is a legacy alias for this
        bufsize=1,
        shell=True
    )
    return process
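
def wait_for_port(port, host="localhost", timeout=60.0):
    """Optional helper: poll a TCP port until it accepts connections.

    The fixed time.sleep() calls in start_services() assume the
    controller/worker come up within a set window; polling the port
    they bind is more robust. The timeout values here are guesses.
    """
    import socket
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return True
        except OSError:
            time.sleep(0.5)
    return False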

def read_process_output(process, name):
    """Stream a process's output, yielding the accumulated text.

    Calling ``component.update(...)`` outside an event handler is a
    no-op in recent Gradio versions, so this is written as a generator:
    a Gradio event handler can iterate over it (or a background thread
    can poll it) to stream output into a Textbox.
    """
    full_output = f"### {name} Output:\n\n"
    for line in process.stdout:
        full_output += line
        yield full_output

    # Process ended
    return_code = process.wait()
    full_output += f"\n\nProcess exited with code {return_code}"
    yield full_output

def setup_environment():
    """Set up the environment by installing dependencies and downloading models."""
    # Create necessary directories
    os.makedirs("models/speech_encoder", exist_ok=True)
    os.makedirs("vocoder", exist_ok=True)
    
    output = "Setting up environment...\n"
    
    # Install dependencies only if not in HF Space (they're pre-installed there)
    if not HF_SPACES:
        output += "Installing dependencies...\n"
        subprocess.run("pip install openai-whisper>=20231117", shell=True)
        subprocess.run("pip install fairseq==0.12.2", shell=True)
    
    # Download vocoder if needed
    if not os.path.exists("vocoder/g_00500000"):
        output += "Downloading vocoder...\n"
        subprocess.run(
            "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/",
            shell=True
        )
        subprocess.run(
            "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/",
            shell=True
        )
    
    # Initialize Whisper (it will be downloaded automatically)
    output += "Initializing Whisper model (this may take a while)...\n"
    try:
        import whisper
        whisper.load_model("tiny", download_root="models/speech_encoder/")
        output += "βœ… Whisper model initialized successfully!\n"
    except Exception as e:
        output += f"❌ Error initializing Whisper model: {str(e)}\n"
    
    return output + "βœ… Environment setup complete!"

def start_services():
    """Start the controller, model worker, and web server."""
    output = "Starting LLaMA-Omni services...\n"
    
    # Start the controller
    controller_cmd = "python -m omni_speech.serve.controller --host 0.0.0.0 --port 10000"
    controller_process = run_background_process(controller_cmd, "Controller")
    output += "βœ… Controller started\n"
    
    # Wait for the controller to come up; polling its port with
    # wait_for_port() above would be more robust than a fixed sleep
    time.sleep(5)
    
    # Start the model worker
    worker_cmd = f"python -m omni_speech.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path {MODEL_PATH} --model-name Llama-3.1-8B-Omni --s2s"
    model_worker_process = run_background_process(worker_cmd, "Model Worker")
    output += f"βœ… Model worker started with model: {MODEL_PATH}\n"
    
    # Wait for model worker to start
    time.sleep(10)
    
    # Start the web server (this is handled separately since we're using the Gradio UI directly)
    output += "βœ… All services started successfully!\n"
    
    # Keep references to processes to prevent garbage collection
    global controller_proc, worker_proc
    controller_proc = controller_process
    worker_proc = model_worker_process
    
    return output
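
def transcribe_with_whisper(audio_path):
    """Sketch (not wired in yet): transcribe an audio file with the
    locally cached Whisper "tiny" model downloaded in setup_environment().
    The placeholder handlers in create_chat_ui() could call this.
    """
    import whisper
    model = whisper.load_model("tiny", download_root="models/speech_encoder/")
    result = model.transcribe(audio_path)
    return result["text"]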

def create_chat_ui(setup_status="Not started", services_status="Not started"):
    """Create the chat interface for LLaMA-Omni."""
    with gr.Blocks() as demo:
        gr.Markdown("# πŸ¦™πŸŽ§ LLaMA-Omni: Seamless Speech Interaction")
        
        # Setup and status
        with gr.Row():
            with gr.Column(scale=1):
                setup_btn = gr.Button("1️⃣ Setup Environment")
                services_btn = gr.Button("2️⃣ Start LLaMA-Omni Services", interactive=False)
            
            with gr.Column(scale=2):
                setup_output = gr.Textbox(label="Setup Status", value=setup_status, lines=5)
                services_output = gr.Textbox(label="Services Status", value=services_status, lines=5)
        
        # Chat interface
        with gr.Tabs():
            with gr.TabItem("Speech Input"):
                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or upload audio")
                transcription_output = gr.Textbox(label="Transcription", interactive=False)
                submit_audio_btn = gr.Button("Submit Audio", interactive=False)
                
            with gr.TabItem("Text Input"):
                text_input = gr.Textbox(label="Text Input", placeholder="Type your message here...")
                submit_text_btn = gr.Button("Submit Text", interactive=False)
        
        
        # Output area
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
                audio_output = gr.Audio(label="Generated Speech", interactive=False)
        
        # Function to handle setup button
        def on_setup_click():
            output_message = setup_environment()
            return {
                setup_output: gr.update(value=output_message),
                services_btn: gr.update(interactive=True)
            }
        
        # Function to handle services button
        def on_services_click():
            output_message = start_services()
            return {
                services_output: gr.update(value=output_message),
                submit_audio_btn: gr.update(interactive=True),
                submit_text_btn: gr.update(interactive=True)
            }
        
        # Placeholder functions for API calls (to be implemented)
        def on_audio_input(audio):
            if audio:
                # This would use Whisper to transcribe
                return "Transcription will appear here when services are running."
            return ""
        
        def on_audio_submit(audio, chat_history):
            if not audio:
                # Ensure chat_history is returned even if no audio
                return chat_history if chat_history is not None else [], None
            
            # Placeholder for actual transcription logic; see the
            # transcribe_with_whisper sketch above for one way to fill this in
            transcribed_text = "Audio input (transcription pending)"

            # Create new messages in the "messages" format
            new_messages = [
                {"role": "user", "content": transcribed_text},
                {"role": "assistant", "content": "This is a placeholder response. The full model will be running after starting the services."}
            ]
            
            # Append new messages to existing history (or initialize if history is None)
            updated_history = (chat_history if chat_history is not None else []) + new_messages
            return updated_history, None
        
        def on_text_submit(text, chat_history):
            if not text:
                # Ensure chat_history is returned even if no text
                return chat_history if chat_history is not None else [], None
            
            # Create new messages in the "messages" format
            new_messages = [
                {"role": "user", "content": text},
                {"role": "assistant", "content": "This is a placeholder response. The full model will be running after starting the services."}
            ]

            # Append new messages to existing history (or initialize if history is None)
            updated_history = (chat_history if chat_history is not None else []) + new_messages
            return updated_history, None
        
        # Connect events
        setup_btn.click(on_setup_click, outputs=[setup_output, services_btn])
        services_btn.click(on_services_click, outputs=[services_output, submit_audio_btn, submit_text_btn])
        
        audio_input.change(on_audio_input, [audio_input], [transcription_output])
        submit_audio_btn.click(on_audio_submit, [audio_input, chatbot], [chatbot, audio_output])
        submit_text_btn.click(on_text_submit, [text_input, chatbot], [chatbot, audio_output])
        
        # Auto-setup on HF Spaces
        if HF_SPACES:
            def perform_auto_setup_on_load():
                # Update UI to show setup is starting
                yield {
                    setup_output: gr.update(value="Auto-starting setup process...")
                }
                
                # Actual setup call
                final_setup_status_message = setup_environment() 

                # Update UI with final status and enable next button
                yield {
                    setup_output: gr.update(value=final_setup_status_message),
                    services_btn: gr.update(interactive=True)
                }

            demo.load(
                perform_auto_setup_on_load, 
                None, 
                [setup_output, services_btn] 
            )
    
    return demo

if __name__ == "__main__":
    # Global references to background processes
    controller_proc = None
    worker_proc = None
    
    # Build the UI
    demo = create_chat_ui()
    
    # Launch with appropriate parameters for HF Spaces.
    # Note: favicon_path expects a local file path, not a URL, so the
    # default favicon is used here.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        share=False
    )