Spaces:
Build error
Build error
Commit
·
c57019c
1
Parent(s):
1cd5253
ereerre
Browse files- .dockerignore +58 -0
- .gitignore +8 -9
- Dockerfile +42 -0
- README.md +28 -4
- app_gradio_spaces.py +136 -62
- cog.yaml +4 -1
- docker-compose.yml +21 -0
- requirements.txt +1 -2
- requirements_spaces.txt +12 -0
- run_docker.sh +28 -0
- setup_huggingface.sh +24 -0
.dockerignore
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Git
|
2 |
+
.git
|
3 |
+
.gitignore
|
4 |
+
|
5 |
+
# Python
|
6 |
+
__pycache__/
|
7 |
+
*.py[cod]
|
8 |
+
*$py.class
|
9 |
+
*.so
|
10 |
+
.Python
|
11 |
+
env/
|
12 |
+
build/
|
13 |
+
develop-eggs/
|
14 |
+
dist/
|
15 |
+
downloads/
|
16 |
+
eggs/
|
17 |
+
.eggs/
|
18 |
+
lib/
|
19 |
+
lib64/
|
20 |
+
parts/
|
21 |
+
sdist/
|
22 |
+
var/
|
23 |
+
*.egg-info/
|
24 |
+
.installed.cfg
|
25 |
+
*.egg
|
26 |
+
|
27 |
+
# Environments
|
28 |
+
venv/
|
29 |
+
ENV/
|
30 |
+
.env/
|
31 |
+
|
32 |
+
# Model files and data (will be downloaded during container startup)
|
33 |
+
models/
|
34 |
+
*.pt
|
35 |
+
*.pth
|
36 |
+
*.bin
|
37 |
+
*.onnx
|
38 |
+
*.safetensors
|
39 |
+
*.plan
|
40 |
+
*.zip
|
41 |
+
.cache/
|
42 |
+
*/.cache/
|
43 |
+
|
44 |
+
# IDEs
|
45 |
+
.vscode/
|
46 |
+
.idea/
|
47 |
+
*.swp
|
48 |
+
*.swo
|
49 |
+
.cursor/
|
50 |
+
|
51 |
+
# OS specific
|
52 |
+
.DS_Store
|
53 |
+
Thumbs.db
|
54 |
+
|
55 |
+
# Docker
|
56 |
+
Dockerfile
|
57 |
+
docker-compose.yml
|
58 |
+
.dockerignore
|
.gitignore
CHANGED
@@ -26,20 +26,19 @@ ENV/
|
|
26 |
.env/
|
27 |
|
28 |
# Model files and data
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
.cache/
|
38 |
*/.cache/
|
39 |
*incomplete
|
40 |
whisper-large-v3/
|
41 |
cosy2_decoder/
|
42 |
-
speech_encoder/
|
43 |
|
44 |
# Ignore all large model files
|
45 |
flow.decoder.estimator.fp32.onnx
|
|
|
26 |
.env/
|
27 |
|
28 |
# Model files and data
|
29 |
+
# Don't exclude directory structure, only the large files
|
30 |
+
models/**/*.pt
|
31 |
+
models/**/*.pth
|
32 |
+
models/**/*.bin
|
33 |
+
models/**/*.onnx
|
34 |
+
models/**/*.safetensors
|
35 |
+
models/**/*.plan
|
36 |
+
models/**/*.zip
|
37 |
.cache/
|
38 |
*/.cache/
|
39 |
*incomplete
|
40 |
whisper-large-v3/
|
41 |
cosy2_decoder/
|
|
|
42 |
|
43 |
# Ignore all large model files
|
44 |
flow.decoder.estimator.fp32.onnx
|
Dockerfile
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
# Install system dependencies
|
6 |
+
RUN apt-get update && apt-get install -y \
|
7 |
+
build-essential \
|
8 |
+
wget \
|
9 |
+
git \
|
10 |
+
ffmpeg \
|
11 |
+
libsndfile1 \
|
12 |
+
&& rm -rf /var/lib/apt/lists/*
|
13 |
+
|
14 |
+
# Copy requirements first to leverage Docker cache
|
15 |
+
COPY requirements.txt .
|
16 |
+
|
17 |
+
# Install PyTorch first
|
18 |
+
# The requirement must be quoted: RUN uses /bin/sh, which would otherwise parse
# ">=2.0.0" as an output redirect (creating a file named "=2.0.0") and install
# torch with no version constraint.
RUN pip install --no-cache-dir "torch>=2.0.0"
|
19 |
+
|
20 |
+
# Then install other dependencies
|
21 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
22 |
+
|
23 |
+
# Create necessary directories
|
24 |
+
RUN mkdir -p models/speech_encoder vocoder
|
25 |
+
|
26 |
+
# Download the vocoder checkpoint and its config at build time
|
27 |
+
RUN wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 \
|
28 |
+
&& wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json
|
29 |
+
|
30 |
+
# Copy the application code
|
31 |
+
COPY . .
|
32 |
+
|
33 |
+
# Optional: Install flash-attn on compatible systems only
|
34 |
+
RUN if [ "$(uname -m)" != "aarch64" ]; then \
|
35 |
+
pip install --no-cache-dir flash-attn || echo "Failed to install flash-attn, continuing without it"; \
|
36 |
+
fi
|
37 |
+
|
38 |
+
# Expose port for the application
|
39 |
+
EXPOSE 7860
|
40 |
+
|
41 |
+
# Command to run the application
|
42 |
+
CMD ["python", "app_gradio_spaces.py"]
|
README.md
CHANGED
@@ -51,11 +51,35 @@ This is a Gradio deployment of [LLaMA-Omni](https://github.com/ictnlp/LLaMA-Omni
|
|
51 |
pip install flash-attn
|
52 |
```
|
53 |
|
54 |
-
##
|
55 |
|
56 |
-
|
57 |
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
To deploy on Gradio Spaces:
|
61 |
|
@@ -72,7 +96,7 @@ The app will automatically:
|
|
72 |
|
73 |
## 🖥️ Local Usage
|
74 |
|
75 |
-
If you want to run the application locally:
|
76 |
|
77 |
```bash
|
78 |
python app.py
|
|
|
51 |
pip install flash-attn
|
52 |
```
|
53 |
|
54 |
+
## 🐳 Docker Deployment
|
55 |
|
56 |
+
We provide Docker support for easy deployment without worrying about dependencies:
|
57 |
|
58 |
+
1. Make sure Docker and Docker Compose are installed on your system
|
59 |
+
|
60 |
+
2. Build and run the container:
|
61 |
+
```bash
|
62 |
+
# Using the provided shell script
|
63 |
+
./run_docker.sh
|
64 |
+
|
65 |
+
# Or manually with docker-compose
|
66 |
+
docker-compose up --build
|
67 |
+
```
|
68 |
+
|
69 |
+
3. Access the application at http://localhost:7860
|
70 |
+
|
71 |
+
The Docker container will automatically:
|
72 |
+
- Install all required dependencies
|
73 |
+
- Download the necessary model files
|
74 |
+
- Start the application
|
75 |
+
|
76 |
+
### GPU Support
|
77 |
+
|
78 |
+
The Docker setup includes NVIDIA GPU support. Make sure you have:
|
79 |
+
- NVIDIA drivers installed on your host
|
80 |
+
- NVIDIA Container Toolkit installed (for GPU passthrough)
|
81 |
+
|
82 |
+
## 🚀 Gradio Spaces Deployment
|
83 |
|
84 |
To deploy on Gradio Spaces:
|
85 |
|
|
|
96 |
|
97 |
## 🖥️ Local Usage
|
98 |
|
99 |
+
If you want to run the application locally without Docker:
|
100 |
|
101 |
```bash
|
102 |
python app.py
|
app_gradio_spaces.py
CHANGED
@@ -5,6 +5,11 @@ import threading
|
|
5 |
import time
|
6 |
import gradio as gr
|
7 |
|
|
|
|
|
|
|
|
|
|
|
8 |
def run_background_process(cmd, name):
|
9 |
"""Run a background process and return the process object."""
|
10 |
print(f"Starting {name}...")
|
@@ -37,98 +42,167 @@ def setup_environment():
|
|
37 |
os.makedirs("models/speech_encoder", exist_ok=True)
|
38 |
os.makedirs("vocoder", exist_ok=True)
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
# Download vocoder
|
45 |
if not os.path.exists("vocoder/g_00500000"):
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
# Initialize Whisper (it will be downloaded automatically)
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
return "✅ Environment setup complete!"
|
53 |
|
54 |
def start_services():
|
55 |
"""Start the controller, model worker, and web server."""
|
|
|
|
|
56 |
# Start the controller
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
)
|
61 |
|
62 |
# Wait for controller to start
|
63 |
time.sleep(5)
|
64 |
|
65 |
# Start the model worker
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
)
|
70 |
|
71 |
# Wait for model worker to start
|
72 |
time.sleep(10)
|
73 |
|
74 |
-
# Start the web server
|
75 |
-
|
76 |
-
"python -m omni_speech.serve.gradio_web_server --controller http://localhost:10000 --port 8001 --model-list-mode reload --vocoder vocoder/g_00500000 --vocoder-cfg vocoder/config.json",
|
77 |
-
"Web Server"
|
78 |
-
)
|
79 |
|
80 |
-
#
|
81 |
-
|
|
|
|
|
82 |
|
83 |
-
return
|
84 |
|
85 |
-
def
|
86 |
-
"""
|
87 |
with gr.Blocks() as demo:
|
88 |
-
gr.Markdown("# 🦙🎧 LLaMA-Omni
|
89 |
-
|
90 |
-
with gr.Tab("Setup"):
|
91 |
-
setup_btn = gr.Button("Setup Environment")
|
92 |
-
setup_output = gr.Textbox(label="Setup Output", value="Click 'Setup Environment' to start.")
|
93 |
-
setup_btn.click(setup_environment, outputs=setup_output)
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
model_worker_output = gr.Markdown(value="Model Worker not started")
|
101 |
-
web_server_output = gr.Markdown(value="Web Server not started")
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
)
|
107 |
-
|
108 |
-
interface_btn = gr.Button("Open Interface")
|
109 |
-
interface_btn.click(lambda: gr.Redirect("http://localhost:8001"), None, None)
|
110 |
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
* Simultaneous generation of both text and speech responses
|
122 |
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
-
|
128 |
-
""")
|
129 |
|
130 |
return demo
|
131 |
|
132 |
if __name__ == "__main__":
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import time
|
6 |
import gradio as gr
|
7 |
|
8 |
+
# Configure environment for HF Spaces
|
9 |
+
HF_SPACES = os.environ.get("SPACE_ID") is not None
|
10 |
+
MODEL_PATH = os.environ.get("MODEL_PATH", "ICTNLP/Llama-3.1-8B-Omni")
|
11 |
+
DEVICE = "cuda" if os.environ.get("SYSTEM_CUDA_VISIBLE_DEVICES") else "cpu"
|
12 |
+
|
13 |
def run_background_process(cmd, name):
|
14 |
"""Run a background process and return the process object."""
|
15 |
print(f"Starting {name}...")
|
|
|
42 |
os.makedirs("models/speech_encoder", exist_ok=True)
|
43 |
os.makedirs("vocoder", exist_ok=True)
|
44 |
|
45 |
+
output = "Setting up environment...\n"
|
46 |
+
|
47 |
+
# Install dependencies only if not in HF Space (they're pre-installed there)
|
48 |
+
if not HF_SPACES:
|
49 |
+
output += "Installing dependencies...\n"
|
50 |
+
# The specifier is quoted for the shell: with shell=True an unquoted ">=" is an
# output redirect, so pip would ignore the version bound and write a stray file.
subprocess.run("pip install 'openai-whisper>=20231117'", shell=True)
|
51 |
+
subprocess.run("pip install fairseq==0.12.2", shell=True)
|
52 |
|
53 |
+
# Download vocoder if needed
|
54 |
if not os.path.exists("vocoder/g_00500000"):
|
55 |
+
output += "Downloading vocoder...\n"
|
56 |
+
subprocess.run(
|
57 |
+
"wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/",
|
58 |
+
shell=True
|
59 |
+
)
|
60 |
+
subprocess.run(
|
61 |
+
"wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/",
|
62 |
+
shell=True
|
63 |
+
)
|
64 |
|
65 |
# Initialize Whisper (it will be downloaded automatically)
|
66 |
+
output += "Initializing Whisper model (this may take a while)...\n"
|
67 |
+
try:
|
68 |
+
import whisper
|
69 |
+
whisper.load_model("large-v3", download_root="models/speech_encoder/")
|
70 |
+
output += "✅ Whisper model initialized successfully!\n"
|
71 |
+
except Exception as e:
|
72 |
+
output += f"❌ Error initializing Whisper model: {str(e)}\n"
|
73 |
|
74 |
+
return output + "✅ Environment setup complete!"
|
75 |
|
76 |
def start_services():
|
77 |
"""Start the controller, model worker, and web server."""
|
78 |
+
output = "Starting LLaMA-Omni services...\n"
|
79 |
+
|
80 |
# Start the controller
|
81 |
+
controller_cmd = "python -m omni_speech.serve.controller --host 0.0.0.0 --port 10000"
|
82 |
+
controller_process = run_background_process(controller_cmd, "Controller")
|
83 |
+
output += "✅ Controller started\n"
|
|
|
84 |
|
85 |
# Wait for controller to start
|
86 |
time.sleep(5)
|
87 |
|
88 |
# Start the model worker
|
89 |
+
worker_cmd = f"python -m omni_speech.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path {MODEL_PATH} --model-name Llama-3.1-8B-Omni --s2s"
|
90 |
+
model_worker_process = run_background_process(worker_cmd, "Model Worker")
|
91 |
+
output += f"✅ Model worker started with model: {MODEL_PATH}\n"
|
|
|
92 |
|
93 |
# Wait for model worker to start
|
94 |
time.sleep(10)
|
95 |
|
96 |
+
# Start the web server (this is handled separately since we're using the Gradio UI directly)
|
97 |
+
output += "✅ All services started successfully!\n"
|
|
|
|
|
|
|
98 |
|
99 |
+
# Keep references to processes to prevent garbage collection
|
100 |
+
global controller_proc, worker_proc
|
101 |
+
controller_proc = controller_process
|
102 |
+
worker_proc = model_worker_process
|
103 |
|
104 |
+
return output
|
105 |
|
106 |
+
def create_chat_ui(setup_status="Not started", services_status="Not started"):
|
107 |
+
"""Create the chat interface for LLaMA-Omni."""
|
108 |
with gr.Blocks() as demo:
|
109 |
+
gr.Markdown("# 🦙🎧 LLaMA-Omni: Seamless Speech Interaction")
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
+
# Setup and status
|
112 |
+
with gr.Row():
|
113 |
+
with gr.Column(scale=1):
|
114 |
+
setup_btn = gr.Button("1️⃣ Setup Environment")
|
115 |
+
services_btn = gr.Button("2️⃣ Start LLaMA-Omni Services", interactive=False)
|
|
|
|
|
116 |
|
117 |
+
with gr.Column(scale=2):
|
118 |
+
setup_output = gr.Textbox(label="Setup Status", value=setup_status, lines=5)
|
119 |
+
services_output = gr.Textbox(label="Services Status", value=services_status, lines=5)
|
|
|
|
|
|
|
|
|
120 |
|
121 |
+
# Chat interface
|
122 |
+
with gr.Tabs():
|
123 |
+
with gr.TabItem("Speech Input"):
|
124 |
+
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or upload audio")
|
125 |
+
transcription_output = gr.Textbox(label="Transcription", interactive=False)
|
126 |
+
submit_audio_btn = gr.Button("Submit Audio", interactive=False)
|
127 |
+
|
128 |
+
with gr.TabItem("Text Input"):
|
129 |
+
text_input = gr.Textbox(label="Text Input", placeholder="Type your message here...")
|
130 |
+
submit_text_btn = gr.Button("Submit Text", interactive=False)
|
131 |
+
|
132 |
+
# Output area
|
133 |
+
with gr.Row():
|
134 |
+
with gr.Column():
|
135 |
+
chatbot = gr.Chatbot(label="Conversation", height=400)
|
136 |
+
audio_output = gr.Audio(label="Generated Speech", interactive=False)
|
137 |
+
|
138 |
+
# Function to handle setup button
|
139 |
+
def on_setup_click():
|
140 |
+
output = setup_environment()
|
141 |
+
return output, gr.Button.update(interactive=True)
|
142 |
+
|
143 |
+
# Function to handle services button
|
144 |
+
def on_services_click():
|
145 |
+
output = start_services()
|
146 |
+
return output, gr.Button.update(interactive=True), gr.Button.update(interactive=True)
|
147 |
+
|
148 |
+
# Placeholder functions for API calls (to be implemented)
|
149 |
+
def on_audio_input(audio):
|
150 |
+
if audio:
|
151 |
+
# This would use Whisper to transcribe
|
152 |
+
return "Transcription will appear here when services are running."
|
153 |
+
return ""
|
154 |
+
|
155 |
+
def on_audio_submit(audio, chat_history):
|
156 |
+
if not audio:
|
157 |
+
return chat_history, None
|
158 |
|
159 |
+
user_msg = "Audio message (transcription will be added when implemented)"
|
160 |
+
bot_msg = "This is a placeholder response. The full model will be running after starting the services."
|
|
|
161 |
|
162 |
+
history = chat_history + [(user_msg, bot_msg)]
|
163 |
+
return history, None
|
164 |
+
|
165 |
+
def on_text_submit(text, chat_history):
|
166 |
+
if not text:
|
167 |
+
return chat_history, None
|
168 |
|
169 |
+
history = chat_history + [(text, "This is a placeholder response. The full model will be running after starting the services.")]
|
170 |
+
return history, None
|
171 |
+
|
172 |
+
# Connect events
|
173 |
+
setup_btn.click(on_setup_click, outputs=[setup_output, services_btn])
|
174 |
+
services_btn.click(on_services_click, outputs=[services_output, submit_audio_btn, submit_text_btn])
|
175 |
+
|
176 |
+
audio_input.change(on_audio_input, [audio_input], [transcription_output])
|
177 |
+
submit_audio_btn.click(on_audio_submit, [audio_input, chatbot], [chatbot, audio_output])
|
178 |
+
submit_text_btn.click(on_text_submit, [text_input, chatbot], [chatbot, audio_output])
|
179 |
+
|
180 |
+
# Auto-setup on HF Spaces
|
181 |
+
if HF_SPACES:
|
182 |
+
# Run setup automatically in a separate thread
|
183 |
+
def auto_setup():
|
184 |
+
time.sleep(2) # Wait for UI to load
|
185 |
+
setup_output.update(value="Auto-starting setup process...")
|
186 |
+
setup_status = setup_environment()
|
187 |
+
setup_output.update(value=setup_status)
|
188 |
+
services_btn.update(interactive=True)
|
189 |
|
190 |
+
threading.Thread(target=auto_setup, daemon=True).start()
|
|
|
191 |
|
192 |
return demo
|
193 |
|
194 |
if __name__ == "__main__":
|
195 |
+
# Global references to background processes
|
196 |
+
controller_proc = None
|
197 |
+
worker_proc = None
|
198 |
+
|
199 |
+
# Build the UI
|
200 |
+
demo = create_chat_ui()
|
201 |
+
|
202 |
+
# Launch with appropriate parameters for HF Spaces
|
203 |
+
demo.launch(
|
204 |
+
server_name="0.0.0.0",
|
205 |
+
server_port=int(os.environ.get("PORT", 7860)),
|
206 |
+
share=False,
|
207 |
+
favicon_path="https://huggingface.co/front/assets/huggingface_logo-noborder.ico"
|
208 |
+
)
|
cog.yaml
CHANGED
@@ -17,11 +17,14 @@ build:
|
|
17 |
- "wget"
|
18 |
- "ffmpeg"
|
19 |
- "libsndfile1"
|
|
|
|
|
20 |
run:
|
21 |
- "pip install -e git+https://github.com/pytorch/fairseq.git#egg=fairseq"
|
22 |
-
- "
|
23 |
- "mkdir -p vocoder"
|
24 |
- "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/"
|
25 |
- "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/"
|
|
|
26 |
|
27 |
predict: "predict.py:Predictor"
|
|
|
17 |
- "wget"
|
18 |
- "ffmpeg"
|
19 |
- "libsndfile1"
|
20 |
+
- "build-essential"
|
21 |
+
- "git"
|
22 |
run:
|
23 |
- "pip install -e git+https://github.com/pytorch/fairseq.git#egg=fairseq"
|
24 |
+
# Best-effort flash-attn install (skipped on Apple Silicon, failure tolerated).
# The Python snippet is wrapped in shell single quotes, so it must contain only
# double-quoted literals: a single quote inside would end the -c argument early.
- "python -c 'import platform; import subprocess; arch=platform.machine(); is_arm64=arch==\"arm64\" or arch==\"aarch64\"; is_darwin=platform.system()==\"Darwin\"; not_compatible=is_arm64 and is_darwin; exit_code=subprocess.call([\"pip\", \"install\", \"flash-attn==2.3.0\"]) if not not_compatible else 0; status=\"skipped on Apple Silicon\" if not_compatible else (\"completed\" if exit_code==0 else \"failed but continuing\"); print(\"flash-attn installation \" + status)'"
|
25 |
- "mkdir -p vocoder"
|
26 |
- "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000 -P vocoder/"
|
27 |
- "wget https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json -P vocoder/"
|
28 |
+
- "mkdir -p models/speech_encoder"
|
29 |
|
30 |
predict: "predict.py:Predictor"
|
docker-compose.yml
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: '3'
|
2 |
+
|
3 |
+
services:
|
4 |
+
llama-omni:
|
5 |
+
build:
|
6 |
+
context: .
|
7 |
+
dockerfile: Dockerfile
|
8 |
+
ports:
|
9 |
+
- "7860:7860"
|
10 |
+
volumes:
|
11 |
+
- ./models:/app/models
|
12 |
+
environment:
|
13 |
+
- GRADIO_SERVER_NAME=0.0.0.0
|
14 |
+
- GRADIO_SERVER_PORT=7860
|
15 |
+
deploy:
|
16 |
+
resources:
|
17 |
+
reservations:
|
18 |
+
devices:
|
19 |
+
- driver: nvidia
|
20 |
+
count: 1
|
21 |
+
capabilities: [gpu]
|
requirements.txt
CHANGED
@@ -9,5 +9,4 @@ pydantic>=2.3.0
|
|
9 |
openai-whisper>=0.0.1
|
10 |
tqdm>=4.66.1
|
11 |
requests>=2.31.0
|
12 |
-
git+https://github.com/pytorch/fairseq.git
|
13 |
-
flash-attn>=2.3.0; platform_system != "Darwin" or platform_machine != "arm64"
|
|
|
9 |
openai-whisper>=0.0.1
|
10 |
tqdm>=4.66.1
|
11 |
requests>=2.31.0
|
12 |
+
git+https://github.com/pytorch/fairseq.git
|
|
requirements_spaces.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=2.0.0
|
2 |
+
numpy>=1.24.0
|
3 |
+
transformers>=4.34.0
|
4 |
+
accelerate>=0.21.0
|
5 |
+
gradio>=3.50.2
|
6 |
+
fastapi>=0.104.0
|
7 |
+
uvicorn>=0.23.2
|
8 |
+
pydantic>=2.3.0
|
9 |
+
openai-whisper>=0.0.1
|
10 |
+
tqdm>=4.66.1
|
11 |
+
requests>=2.31.0
|
12 |
+
git+https://github.com/pytorch/fairseq.git
|
run_docker.sh
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Make script exit on error
|
4 |
+
set -e
|
5 |
+
|
6 |
+
# Check if Docker is installed
|
7 |
+
if ! command -v docker &> /dev/null; then
|
8 |
+
echo "Error: Docker is not installed. Please install Docker first."
|
9 |
+
exit 1
|
10 |
+
fi
|
11 |
+
|
12 |
+
# Check if docker-compose is installed
|
13 |
+
if ! command -v docker-compose &> /dev/null && ! docker compose version &> /dev/null; then
|
14 |
+
echo "Error: Neither docker-compose nor 'docker compose' is available. Please install docker-compose."
|
15 |
+
exit 1
|
16 |
+
fi
|
17 |
+
|
18 |
+
# Build and start the container
|
19 |
+
echo "Building and starting LLaMA-Omni container..."
|
20 |
+
|
21 |
+
# Check if docker compose plugin or standalone docker-compose is available
|
22 |
+
if docker compose version &> /dev/null; then
|
23 |
+
# Using Docker Compose plugin
|
24 |
+
docker compose up --build
|
25 |
+
else
|
26 |
+
# Using standalone docker-compose
|
27 |
+
docker-compose up --build
|
28 |
+
fi
|
setup_huggingface.sh
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Make script exit on error
|
4 |
+
set -e
|
5 |
+
|
6 |
+
echo "Setting up LLaMA-Omni on Hugging Face Spaces..."
|
7 |
+
|
8 |
+
# Create necessary directories
|
9 |
+
mkdir -p models/speech_encoder vocoder
|
10 |
+
|
11 |
+
# Download vocoder models if needed
|
12 |
+
if [ ! -f "vocoder/g_00500000" ]; then
|
13 |
+
echo "Downloading vocoder models..."
|
14 |
+
wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000
|
15 |
+
wget -P vocoder/ https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json
|
16 |
+
fi
|
17 |
+
|
18 |
+
# Create empty __init__.py files for the package structure
|
19 |
+
mkdir -p omni_speech/serve omni_speech/infer/examples
|
20 |
+
touch omni_speech/__init__.py
|
21 |
+
touch omni_speech/serve/__init__.py
|
22 |
+
touch omni_speech/infer/__init__.py
|
23 |
+
|
24 |
+
echo "✅ Setup complete! LLaMA-Omni is now ready to run on Hugging Face Spaces."
|