Spaces: Build error
Commit 7ac31cd
Parent(s): bef9f8f
dsd
app.py CHANGED
[Old side of the diff: the removed lines appear truncated in this capture, so most of their content is unrecoverable. Identifiable removals: `import time`; the CosyVoice decoder configuration (`COSYVOICE_HF_REPO = "ICTNLP/cosy2_decoder"`, `COSYVOICE_PATH = "models/cosy2_decoder"`); and the commented-out model pre-loading calls in the `__main__` block. The other removed lines were earlier versions of the print messages, configuration values, and model-loading arguments that this commit rewrites. The new side of each hunk follows; lines added by this commit are marked `+`.]
@@ -1,59 +1,57 @@
import os
import sys
import gradio as gr
import whisper
from huggingface_hub import snapshot_download
import torch
import subprocess
+import transformers; transformers.utils.import_utils.check_dependency_versions()

# --- Aggressively update/install transformers and huggingface_hub BEFORE importing them ---
+print('Attempting to upgrade pip, transformers, and huggingface_hub...')
try:
+    print('Upgrading pip...')
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'pip'])
+    print('Upgrading transformers and huggingface_hub...')
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'transformers', 'huggingface_hub'])
+    print('Attempting to install transformers from main branch for latest features...')
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/huggingface/transformers.git'])
+    print('Pip, Transformers, and huggingface_hub update/install process completed.')
except subprocess.CalledProcessError as e:
+    print(f'ERROR: Failed to upgrade/install packages: {e}')
+    print('Continuing with potentially older versions. This might lead to model loading issues.')
except Exception as e:
+    print(f'An unexpected error occurred during package upgrades: {e}')

# --- Now, import from transformers ---
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+    print('Successfully imported AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.')
except ImportError as e:
+    print(f'CRITICAL ERROR: Failed to import from transformers after attempting upgrades: {e}')
+    print('The application might not work correctly. Please check the environment and dependencies.')
    # As a last resort, define dummy classes if import fails, so the rest of the script doesn't crash immediately
    class AutoModelForCausalLM: pass
    class AutoTokenizer: pass
    class AutoConfig: pass
except Exception as e:
+    print(f'An unexpected error occurred during transformers import: {e}')

# --- Configuration ---
+WHISPER_MODEL_SIZE = 'small'  # Using smallest model for faster processing in testing
+SPEECH_ENCODER_PATH = 'models/speech_encoder'
+MODEL_NAME = 'LLaMA-Omni2-0.5B'
+MODEL_PATH = f'models/{MODEL_NAME}'
+HF_REPO = f'ICTNLP/{MODEL_NAME}'

# --- Print diagnostics ---
+print('===== Application Startup =====')
+print('Python:', sys.version)
+print('Torch version:', torch.__version__)
+print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
+    print(f'CUDA device: {torch.cuda.get_device_name(0)}')
+    print(f'CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB')

# --- Main models ---
whisper_model = None
@@ -61,115 +59,112 @@ llama_model = None
tokenizer = None

def load_whisper_model():
+    '''Load Whisper model for speech recognition'''
    global whisper_model
+    print(f'Loading Whisper {WHISPER_MODEL_SIZE} model...')

    # Create directory if it doesn't exist
    os.makedirs(SPEECH_ENCODER_PATH, exist_ok=True)

    # Load the model (will download if not present)
    whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, download_root=SPEECH_ENCODER_PATH)
+    print(f'Whisper {WHISPER_MODEL_SIZE} model loaded successfully!')
    return whisper_model

def load_llama_model():
+    '''Load LLaMA-Omni2 model'''
    global llama_model, tokenizer
+    print(f'Attempting to load LLaMA-Omni2 model: {HF_REPO}')

    # Ensure local model directory exists for downloads
+    os.makedirs(MODEL_PATH, exist_ok=True)

    # Download model files if they aren't already present locally
    # Check for a common file like config.json to decide if download is needed
+    if not os.path.exists(os.path.join(MODEL_PATH, 'config.json')):
+        print(f'Local model files not found. Downloading from Hugging Face Hub: {HF_REPO} to {MODEL_PATH}')
        try:
            snapshot_download(
+                repo_id=HF_REPO,
+                local_dir=MODEL_PATH,
                local_dir_use_symlinks=False,
                resume_download=True,
            )
+            print('Model download complete.')
        except Exception as e:
+            print(f'ERROR during model download: {e}')
+            pass  # Allow to proceed to loading attempt, which will then fail more descriptively

    try:
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        torch_dtype = torch.float16 if device == 'cuda' else torch.float32
+        print(f'Target device: {device}, dtype: {torch_dtype}')

+        print(f'Attempt 1: Loading tokenizer and model directly from Hub identifier: {HF_REPO} with trust_remote_code=True')
        try:
            tokenizer = AutoTokenizer.from_pretrained(
+                HF_REPO,
                trust_remote_code=True
            )
+            print('Tokenizer loaded successfully from Hub identifier.')

            config = AutoConfig.from_pretrained(
+                HF_REPO,
                trust_remote_code=True
            )
+            print('Config loaded successfully from Hub identifier.')

            llama_model = AutoModelForCausalLM.from_pretrained(
+                HF_REPO,
+                config=config,  # Pass the loaded config
                torch_dtype=torch_dtype,
+                device_map=device,  # device_map handles moving parts of the model to CPU if OOM on GPU
                trust_remote_code=True
            )
+            print(f'LLaMA-Omni2 model loaded successfully directly from Hub: {HF_REPO}')
            return llama_model
        except Exception as e1:
+            print(f'Error in Attempt 1 (direct Hub load for {HF_REPO}): {e1}')
+            print('This often means the model requires a specific transformers version or has complex remote code.')

+            print(f'Attempt 2: Loading tokenizer and model from local path: {MODEL_PATH} with trust_remote_code=True (fallback)')
            try:
                tokenizer = AutoTokenizer.from_pretrained(
+                    MODEL_PATH,  # Fallback to local path
                    trust_remote_code=True
                )
+                print('Tokenizer loaded successfully from local path.')

                config = AutoConfig.from_pretrained(
+                    MODEL_PATH,
                    trust_remote_code=True
                )
+                print('Config loaded successfully from local path.')

                llama_model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_PATH,  # Fallback to local path
                    config=config,
                    torch_dtype=torch_dtype,
                    device_map=device,
                    trust_remote_code=True
                )
+                print(f'LLaMA-Omni2 model loaded successfully from local path: {MODEL_PATH}')
                return llama_model
            except Exception as e2:
+                print(f'Error in Attempt 2 (local path load for {MODEL_PATH}): {e2}')

+                print('All attempts to load the LLaMA-Omni2 model failed.')
+                raise RuntimeError('Failed to load LLaMA-Omni2 model after multiple attempts.')

    except Exception as e_outer:
+        print(f'CRITICAL ERROR loading LLaMA-Omni2 model: {e_outer}')
+        print('Falling back: Text generation will not be available.')
+        llama_model = None  # Ensure llama_model is None if loading fails
+        tokenizer = None  # Ensure tokenizer is None
        return None

def transcribe_audio(audio_path):
+    '''Transcribe audio using Whisper'''
    global whisper_model

    if whisper_model is None:
@@ -177,12 +172,12 @@ def transcribe_audio(audio_path):

    try:
        result = whisper_model.transcribe(audio_path)
+        return result['text']
    except Exception as e:
+        return f'Error transcribing audio: {e}'

def generate_text(input_text):
+    '''Generate text using LLaMA-Omni2'''
    global llama_model, tokenizer

    if llama_model is None or tokenizer is None:
@@ -191,10 +186,10 @@ def generate_text(input_text):
    try:
        # If model loading failed, just return a placeholder response
        if llama_model is None:
+            return f'Model could not be loaded. Input was: {input_text}'

        device = next(llama_model.parameters()).device
+        inputs = tokenizer(input_text, return_tensors='pt').to(device)

        outputs = llama_model.generate(
            inputs.input_ids,
@@ -206,10 +201,10 @@

        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
+        return f'Error generating text: {e}'

def speech_to_text_to_speech(audio_path):
+    '''Pipeline: Speech -> Text -> Response'''
    # First transcribe the audio
    transcription = transcribe_audio(audio_path)

@@ -218,42 +213,38 @@ def speech_to_text_to_speech(audio_path):

    return transcription, response

+# --- Gradio Interface for Hugging Face Spaces ---
def create_demo():
+    with gr.Blocks(title='LLaMA-Omni2 Demo on Hugging Face Spaces') as demo:
+        gr.Markdown('# LLaMA-Omni2 Demo')
+        gr.Markdown('This demo uses the smallest Whisper model and LLaMA-Omni2-0.5B for testing purposes.')

+        with gr.Tab('Text Generation'):
            with gr.Row():
+                text_input = gr.Textbox(label='Input Text', placeholder='Enter text here...')
+                text_output = gr.Textbox(label='Generated Response')

+            text_button = gr.Button('Generate Response')
            text_button.click(generate_text, inputs=text_input, outputs=text_output)

+        with gr.Tab('Speech-to-Text'):
+            audio_input = gr.Audio(type='filepath', label='Upload or Record Audio')
+            transcription_output = gr.Textbox(label='Transcription')
+            response_output = gr.Textbox(label='Generated Response')

+            transcribe_button = gr.Button('Transcribe and Respond')
            transcribe_button.click(speech_to_text_to_speech,
                                    inputs=audio_input,
                                    outputs=[transcription_output, response_output])

+        gr.Markdown('### Note: The first run will download models if needed, which may take some time.')

    return demo

# --- Main entry point ---
+if __name__ == '__main__':
+    print('Starting LLaMA-Omni2 Interface for Hugging Face Spaces...')

    # Create and launch the Gradio interface
    demo = create_demo()
+    demo.launch(server_name='0.0.0.0', server_port=7860, share=True)  # share=True for Hugging Face Spaces
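A minimal sketch of how the functions above could be smoke-tested locally without launching the Gradio UI. It is not part of this commit: the module name `app`, the `sample.wav` path, and the prompt string are placeholders, and importing the module will still run its top-level setup (the pip upgrades and diagnostics shown above), though not `demo.launch()`, which is guarded by `__main__`.

# smoke_test.py -- hypothetical check, not part of commit 7ac31cd.
# Assumes the file above is saved as app.py and its dependencies are installed.
import app

app.load_whisper_model()      # downloads Whisper into models/speech_encoder if missing
app.load_llama_model()        # leaves llama_model as None if every load attempt fails

print(app.transcribe_audio('sample.wav'))                # 'sample.wav' is a placeholder audio file
print(app.generate_text('Hello from the smoke test'))    # returns a placeholder string if the model is unavailable

Running a check like this before pushing tends to surface dependency and download failures as plain tracebacks rather than as a Space build error.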