marcosremar2 committed
Commit c57019c · 1 Parent(s): 1cd5253
.dockerignore ADDED
@@ -0,0 +1,58 @@
+ # Git
+ .git
+ .gitignore
+ 
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ 
+ # Environments
+ venv/
+ ENV/
+ .env/
+ 
+ # Model files and data (will be downloaded during container startup)
+ models/
+ *.pt
+ *.pth
+ *.bin
+ *.onnx
+ *.safetensors
+ *.plan
+ *.zip
+ .cache/
+ */.cache/
+ 
+ # IDEs
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ .cursor/
+ 
+ # OS specific
+ .DS_Store
+ Thumbs.db
+ 
+ # Docker
+ Dockerfile
+ docker-compose.yml
+ .dockerignore
.gitignore CHANGED
@@ -26,20 +26,19 @@ ENV/
 .env/
 
 # Model files and data
- models/
- *.pt
- *.pth
- *.bin
- *.onnx
- *.safetensors
- *.plan
- *.zip
+ # Don't exclude directory structure, only the large files
+ models/**/*.pt
+ models/**/*.pth
+ models/**/*.bin
+ models/**/*.onnx
+ models/**/*.safetensors
+ models/**/*.plan
+ models/**/*.zip
 .cache/
 */.cache/
 *incomplete
 whisper-large-v3/
 cosy2_decoder/
- speech_encoder/
 
 # Ignore all large model files
 flow.decoder.estimator.fp32.onnx
Dockerfile ADDED
@@ -0,0 +1,42 @@
+ FROM python:3.10-slim
+ 
+ WORKDIR /app
+ 
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     wget \
+     git \
+     ffmpeg \
+     libsndfile1 \
+     && rm -rf /var/lib/apt/lists/*
+ 
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+ 
+ # Install PyTorch first (quoted so the shell doesn't treat >= as a redirect)
+ RUN pip install --no-cache-dir "torch>=2.0.0"
+ 
+ # Then install other dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+ 
+ # Create necessary directories
+ RUN mkdir -p models/speech_encoder vocoder
+ 
+ # Download vocoder models if needed
+ RUN wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 \
+     && wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json
+ 
+ # Copy the application code
+ COPY . .
+ 
+ # Optional: install flash-attn on compatible systems only
+ RUN if [ "$(uname -m)" != "aarch64" ]; then \
+     pip install --no-cache-dir flash-attn || echo "Failed to install flash-attn, continuing without it"; \
+     fi
+ 
+ # Expose port for the application
+ EXPOSE 7860
+ 
+ # Command to run the application
+ CMD ["python", "app_gradio_spaces.py"]
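For reference, the image can also be built and run directly, without Compose; a minimal sketch (the `llama-omni` tag and the volume mount are illustrative choices, not part of this commit):

```bash
# Build from the repository root; the .dockerignore above keeps model
# weights and caches out of the build context.
docker build -t llama-omni .

# Publish the Gradio port and mount ./models so downloads persist across
# restarts; drop --gpus all on hosts without the NVIDIA Container Toolkit.
docker run --rm --gpus all -p 7860:7860 -v "$(pwd)/models:/app/models" llama-omni
```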
README.md CHANGED
@@ -51,11 +51,35 @@ This is a Gradio deployment of [LLaMA-Omni](https://github.com/ictnlp/LLaMA-Omni
 pip install flash-attn
 ```
 
- ## 🚀 Deployment
+ ## 🐳 Docker Deployment
 
- This repository is configured for deployment on Gradio. The model weights and required components will be downloaded automatically during the first initialization.
+ We provide Docker support for easy deployment without worrying about dependencies:
 
- ### Gradio Spaces Deployment
+ 1. Make sure Docker and Docker Compose are installed on your system
+ 
+ 2. Build and run the container:
+ ```bash
+ # Using the provided shell script
+ ./run_docker.sh
+ 
+ # Or manually with docker-compose
+ docker-compose up --build
+ ```
+ 
+ 3. Access the application at http://localhost:7860
+ 
+ The Docker container will automatically:
+ - Install all required dependencies
+ - Download the necessary model files
+ - Start the application
+ 
+ ### GPU Support
+ 
+ The Docker setup includes NVIDIA GPU support. Make sure you have:
+ - NVIDIA drivers installed on your host
+ - NVIDIA Container Toolkit installed (for GPU passthrough)
+ 
+ ## 🚀 Gradio Spaces Deployment
 
 To deploy on Gradio Spaces:
 
@@ -72,7 +96,7 @@ The app will automatically:
 
 ## 🖥️ Local Usage
 
- If you want to run the application locally:
+ If you want to run the application locally without Docker:
 
 ```bash
 python app.py
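A quick way to verify the prerequisites listed under "GPU Support" is to run a throwaway CUDA container; a sketch (the CUDA image tag is an arbitrary example, not part of this commit):

```bash
# If the NVIDIA drivers and Container Toolkit are wired up correctly,
# this prints the same GPU table as running nvidia-smi on the host.
docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi
```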
app_gradio_spaces.py CHANGED
@@ -5,6 +5,11 @@ import threading
 import time
 import gradio as gr
 
+ # Configure environment for HF Spaces
+ HF_SPACES = os.environ.get("SPACE_ID") is not None
+ MODEL_PATH = os.environ.get("MODEL_PATH", "ICTNLP/Llama-3.1-8B-Omni")
+ DEVICE = "cuda" if os.environ.get("SYSTEM_CUDA_VISIBLE_DEVICES") else "cpu"
+ 
 def run_background_process(cmd, name):
     """Run a background process and return the process object."""
     print(f"Starting {name}...")
@@ -37,98 +42,167 @@ def setup_environment():
     os.makedirs("models/speech_encoder", exist_ok=True)
     os.makedirs("vocoder", exist_ok=True)
 
-    # Download whisper model
-    os.system("pip install openai-whisper>=20231117")
-    os.system("pip install fairseq==0.12.2")
+    output = "Setting up environment...\n"
+
+    # Install dependencies only if not in HF Space (they're pre-installed there)
+    if not HF_SPACES:
+        output += "Installing dependencies...\n"
+        # Quote the version specifier so the shell doesn't treat >= as a redirect
+        subprocess.run("pip install 'openai-whisper>=20231117'", shell=True)
+        subprocess.run("pip install fairseq==0.12.2", shell=True)
 
-    # Download vocoder
+    # Download vocoder if needed
     if not os.path.exists("vocoder/g_00500000"):
-        os.system("wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/")
-        os.system("wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/")
+        output += "Downloading vocoder...\n"
+        subprocess.run(
+            "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/",
+            shell=True
+        )
+        subprocess.run(
+            "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/",
+            shell=True
+        )
 
     # Initialize Whisper (it will be downloaded automatically)
-    os.system("python -c \"import whisper; whisper.load_model('large-v3', download_root='models/speech_encoder/')\"")
+    output += "Initializing Whisper model (this may take a while)...\n"
+    try:
+        import whisper
+        whisper.load_model("large-v3", download_root="models/speech_encoder/")
+        output += "✅ Whisper model initialized successfully!\n"
+    except Exception as e:
+        output += f"❌ Error initializing Whisper model: {str(e)}\n"
 
-    return "✅ Environment setup complete!"
+    return output + "✅ Environment setup complete!"
 
 def start_services():
     """Start the controller, model worker, and web server."""
+    output = "Starting LLaMA-Omni services...\n"
+
     # Start the controller
-    controller_process = run_background_process(
-        "python -m omni_speech.serve.controller --host 0.0.0.0 --port 10000",
-        "Controller"
-    )
+    controller_cmd = "python -m omni_speech.serve.controller --host 0.0.0.0 --port 10000"
+    controller_process = run_background_process(controller_cmd, "Controller")
+    output += "Controller started\n"
 
     # Wait for controller to start
     time.sleep(5)
 
     # Start the model worker
-    model_worker_process = run_background_process(
-        "python -m omni_speech.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path Llama-3.1-8B-Omni --model-name Llama-3.1-8B-Omni --s2s",
-        "Model Worker"
-    )
+    worker_cmd = f"python -m omni_speech.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path {MODEL_PATH} --model-name Llama-3.1-8B-Omni --s2s"
+    model_worker_process = run_background_process(worker_cmd, "Model Worker")
+    output += f"Model worker started with model: {MODEL_PATH}\n"
 
     # Wait for model worker to start
     time.sleep(10)
 
-    # Start the web server
-    web_server_process = run_background_process(
-        "python -m omni_speech.serve.gradio_web_server --controller http://localhost:10000 --port 8001 --model-list-mode reload --vocoder vocoder/g_00500000 --vocoder-cfg vocoder/config.json",
-        "Web Server"
-    )
-
-    # Wait for web server to start
-    time.sleep(5)
+    # Start the web server (this is handled separately since we're using the Gradio UI directly)
+    output += "✅ All services started successfully!\n"
 
-    return "✅ All services started successfully! Click 'Open Interface' to access the application."
+    # Keep references to processes to prevent garbage collection
+    global controller_proc, worker_proc
+    controller_proc = controller_process
+    worker_proc = model_worker_process
+
+    return output
 
-def build_ui():
-    """Build the Gradio UI."""
+def create_chat_ui(setup_status="Not started", services_status="Not started"):
+    """Create the chat interface for LLaMA-Omni."""
     with gr.Blocks() as demo:
-        gr.Markdown("# 🦙🎧 LLaMA-Omni Deployment")
-
-        with gr.Tab("Setup"):
-            setup_btn = gr.Button("Setup Environment")
-            setup_output = gr.Textbox(label="Setup Output", value="Click 'Setup Environment' to start.")
-            setup_btn.click(setup_environment, outputs=setup_output)
-
-        with gr.Tab("Services"):
-            start_btn = gr.Button("Start LLaMA-Omni Services")
-            status_output = gr.Textbox(label="Status", value="Click 'Start LLaMA-Omni Services' to begin.")
-
-            controller_output = gr.Markdown(value="Controller not started")
-            model_worker_output = gr.Markdown(value="Model Worker not started")
-            web_server_output = gr.Markdown(value="Web Server not started")
-
-            start_btn.click(
-                start_services,
-                outputs=status_output
-            )
-
-            interface_btn = gr.Button("Open Interface")
-            interface_btn.click(lambda: gr.Redirect("http://localhost:8001"), None, None)
-
-        with gr.Tab("About"):
-            gr.Markdown("""
-            # About LLaMA-Omni
-
-            LLaMA-Omni is a speech-language model built upon Llama-3.1-8B-Instruct. It supports low-latency and high-quality speech interactions, simultaneously generating both text and speech responses based on speech instructions.
-
-            ## Features
-
-            * Built on Llama-3.1-8B-Instruct, ensuring high-quality responses
-            * Low-latency speech interaction with a latency as low as 226ms
-            * Simultaneous generation of both text and speech responses
-
-            ## License
-
-            This code is released under the Apache-2.0 License. The model is intended for academic research purposes only and may NOT be used for commercial purposes.
-
-            Original work by Qingkai Fang, Shoutao Guo, Yan Zhou, Zhengrui Ma, Shaolei Zhang, Yang Feng.
-            """)
+        gr.Markdown("# 🦙🎧 LLaMA-Omni: Seamless Speech Interaction")
+
+        # Setup and status
+        with gr.Row():
+            with gr.Column(scale=1):
+                setup_btn = gr.Button("1️⃣ Setup Environment")
+                services_btn = gr.Button("2️⃣ Start LLaMA-Omni Services", interactive=False)
+
+            with gr.Column(scale=2):
+                setup_output = gr.Textbox(label="Setup Status", value=setup_status, lines=5)
+                services_output = gr.Textbox(label="Services Status", value=services_status, lines=5)
+
+        # Chat interface
+        with gr.Tabs():
+            with gr.TabItem("Speech Input"):
+                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or upload audio")
+                transcription_output = gr.Textbox(label="Transcription", interactive=False)
+                submit_audio_btn = gr.Button("Submit Audio", interactive=False)
+
+            with gr.TabItem("Text Input"):
+                text_input = gr.Textbox(label="Text Input", placeholder="Type your message here...")
+                submit_text_btn = gr.Button("Submit Text", interactive=False)
+
+        # Output area
+        with gr.Row():
+            with gr.Column():
+                chatbot = gr.Chatbot(label="Conversation", height=400)
+                audio_output = gr.Audio(label="Generated Speech", interactive=False)
+
+        # Function to handle setup button
+        def on_setup_click():
+            output = setup_environment()
+            # gr.update works across Gradio versions (gr.Button.update was removed in Gradio 4)
+            return output, gr.update(interactive=True)
+
+        # Function to handle services button
+        def on_services_click():
+            output = start_services()
+            return output, gr.update(interactive=True), gr.update(interactive=True)
+
+        # Placeholder functions for API calls (to be implemented)
+        def on_audio_input(audio):
+            if audio:
+                # This would use Whisper to transcribe
+                return "Transcription will appear here when services are running."
+            return ""
+
+        def on_audio_submit(audio, chat_history):
+            if not audio:
+                return chat_history, None
+
+            user_msg = "Audio message (transcription will be added when implemented)"
+            bot_msg = "This is a placeholder response. The full model will be running after starting the services."
+
+            history = chat_history + [(user_msg, bot_msg)]
+            return history, None
+
+        def on_text_submit(text, chat_history):
+            if not text:
+                return chat_history, None
+
+            history = chat_history + [(text, "This is a placeholder response. The full model will be running after starting the services.")]
+            return history, None
+
+        # Connect events
+        setup_btn.click(on_setup_click, outputs=[setup_output, services_btn])
+        services_btn.click(on_services_click, outputs=[services_output, submit_audio_btn, submit_text_btn])
+
+        audio_input.change(on_audio_input, [audio_input], [transcription_output])
+        submit_audio_btn.click(on_audio_submit, [audio_input, chatbot], [chatbot, audio_output])
+        submit_text_btn.click(on_text_submit, [text_input, chatbot], [chatbot, audio_output])
+
+        # Auto-setup on HF Spaces: run via demo.load so the results actually
+        # reach the UI (calling component .update() from a plain thread is a no-op)
+        if HF_SPACES:
+            def auto_setup():
+                setup_status = setup_environment()
+                return setup_status, gr.update(interactive=True)
+
+            demo.load(auto_setup, outputs=[setup_output, services_btn])
 
     return demo
 
 if __name__ == "__main__":
-    demo = build_ui()
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+    # Global references to background processes
+    controller_proc = None
+    worker_proc = None
+
+    # Build the UI
+    demo = create_chat_ui()
+
+    # Launch with appropriate parameters for HF Spaces
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ.get("PORT", 7860)),
+        share=False,
+        favicon_path="https://huggingface.co/front/assets/huggingface_logo-noborder.ico"
+    )
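Because the new module-level configuration reads `SPACE_ID`, `MODEL_PATH`, and `PORT` from the environment, the script can be pointed at another checkpoint or port without code changes; a usage sketch (the values shown are illustrative):

```bash
# Outside a Space, SPACE_ID is unset, so setup_environment() installs its
# own dependencies; MODEL_PATH and PORT override the defaults.
MODEL_PATH=ICTNLP/Llama-3.1-8B-Omni PORT=8080 python app_gradio_spaces.py
```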
cog.yaml CHANGED
@@ -17,11 +17,14 @@ build:
     - "wget"
     - "ffmpeg"
     - "libsndfile1"
+    - "build-essential"
+    - "git"
   run:
     - "pip install -e git+https://github.com/pytorch/fairseq.git#egg=fairseq"
-    - "if [ $(uname -m) != 'arm64' ] || [ $(uname -s) != 'Darwin' ]; then pip install flash-attn==2.3.0; fi"
+    - "python -c 'import platform, subprocess; arch = platform.machine(); skip = platform.system() == \"Darwin\" and arch in (\"arm64\", \"aarch64\"); rc = 0 if skip else subprocess.call([\"pip\", \"install\", \"flash-attn==2.3.0\"]); print(\"flash-attn installation \" + (\"skipped on Apple Silicon\" if skip else \"completed\" if rc == 0 else \"failed but continuing\"))'"
     - "mkdir -p vocoder"
     - "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/"
     - "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/"
+    - "mkdir -p models/speech_encoder"
 
 predict: "predict.py:Predictor"
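The new `run` step squeezes the platform check into one line because each cog `run` entry is a single command. Unrolled, the logic is equivalent to this shell sketch (illustrative only, not part of the commit):

```bash
# Skip flash-attn only on Apple Silicon (Darwin on arm64/aarch64);
# everywhere else, attempt the install and continue even if it fails.
if [ "$(uname -s)" = "Darwin" ] && { [ "$(uname -m)" = "arm64" ] || [ "$(uname -m)" = "aarch64" ]; }; then
    echo "flash-attn installation skipped on Apple Silicon"
else
    pip install flash-attn==2.3.0 || echo "flash-attn installation failed but continuing"
fi
```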
docker-compose.yml ADDED
@@ -0,0 +1,21 @@
+ version: '3'
+ 
+ services:
+   llama-omni:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     ports:
+       - "7860:7860"
+     volumes:
+       - ./models:/app/models
+     environment:
+       - GRADIO_SERVER_NAME=0.0.0.0
+       - GRADIO_SERVER_PORT=7860
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
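For day-to-day use, the Compose service can be driven directly; a brief sketch (the service name `llama-omni` comes from the file above):

```bash
# Build and start in the background, then follow the app's logs.
docker compose up --build -d
docker compose logs -f llama-omni

# Tear everything down when finished.
docker compose down
```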
requirements.txt CHANGED
@@ -9,5 +9,4 @@ pydantic>=2.3.0
 openai-whisper>=0.0.1
 tqdm>=4.66.1
 requests>=2.31.0
- git+https://github.com/pytorch/fairseq.git
- flash-attn>=2.3.0; platform_system != "Darwin" or platform_machine != "arm64"
+ git+https://github.com/pytorch/fairseq.git
requirements_spaces.txt ADDED
@@ -0,0 +1,12 @@
+ torch>=2.0.0
+ numpy>=1.24.0
+ transformers>=4.34.0
+ accelerate>=0.21.0
+ gradio>=3.50.2
+ fastapi>=0.104.0
+ uvicorn>=0.23.2
+ pydantic>=2.3.0
+ openai-whisper>=0.0.1
+ tqdm>=4.66.1
+ requests>=2.31.0
+ git+https://github.com/pytorch/fairseq.git
run_docker.sh ADDED
@@ -0,0 +1,28 @@
+ #!/bin/bash
+ 
+ # Make script exit on error
+ set -e
+ 
+ # Check if Docker is installed
+ if ! command -v docker &> /dev/null; then
+     echo "Error: Docker is not installed. Please install Docker first."
+     exit 1
+ fi
+ 
+ # Check if docker-compose is installed
+ if ! command -v docker-compose &> /dev/null && ! docker compose version &> /dev/null; then
+     echo "Error: Neither docker-compose nor 'docker compose' is available. Please install docker-compose."
+     exit 1
+ fi
+ 
+ # Build and start the container
+ echo "Building and starting LLaMA-Omni container..."
+ 
+ # Check if docker compose plugin or standalone docker-compose is available
+ if docker compose version &> /dev/null; then
+     # Using Docker Compose plugin
+     docker compose up --build
+ else
+     # Using standalone docker-compose
+     docker-compose up --build
+ fi
setup_huggingface.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/bash
+ 
+ # Make script exit on error
+ set -e
+ 
+ echo "Setting up LLaMA-Omni on Hugging Face Spaces..."
+ 
+ # Create necessary directories
+ mkdir -p models/speech_encoder vocoder
+ 
+ # Download vocoder models if needed
+ if [ ! -f "vocoder/g_00500000" ]; then
+     echo "Downloading vocoder models..."
+     wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000
+     wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json
+ fi
+ 
+ # Create empty __init__.py files for the package structure
+ mkdir -p omni_speech/serve omni_speech/infer/examples
+ touch omni_speech/__init__.py
+ touch omni_speech/serve/__init__.py
+ touch omni_speech/infer/__init__.py
+ 
+ echo "✅ Setup complete! LLaMA-Omni is now ready to run on Hugging Face Spaces."