marcosremar2 committed on
Commit 7ac31cd · 1 Parent(s): bef9f8f
Files changed (1)
  1. app.py +93 -102
app.py CHANGED
@@ -1,59 +1,57 @@
 import os
 import sys
-import time
 import gradio as gr
 import whisper
 from huggingface_hub import snapshot_download
 import torch
 import subprocess
+import transformers; transformers.utils.import_utils.check_dependency_versions()
 
 # --- Aggressively update/install transformers and huggingface_hub BEFORE importing them ---
-print("Attempting to upgrade pip, transformers, and huggingface_hub...")
+print('Attempting to upgrade pip, transformers, and huggingface_hub...')
 try:
-    print("Upgrading pip...")
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "pip"])
-    print("Upgrading transformers and huggingface_hub...")
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "transformers", "huggingface_hub"])
-    print("Attempting to install transformers from main branch for latest features...")
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/huggingface/transformers.git"])
-    print("Pip, Transformers, and huggingface_hub update/install process completed.")
+    print('Upgrading pip...')
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'pip'])
+    print('Upgrading transformers and huggingface_hub...')
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'transformers', 'huggingface_hub'])
+    print('Attempting to install transformers from main branch for latest features...')
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/huggingface/transformers.git'])
+    print('Pip, Transformers, and huggingface_hub update/install process completed.')
 except subprocess.CalledProcessError as e:
-    print(f"ERROR: Failed to upgrade/install packages: {e}")
-    print("Continuing with potentially older versions. This might lead to model loading issues.")
+    print(f'ERROR: Failed to upgrade/install packages: {e}')
+    print('Continuing with potentially older versions. This might lead to model loading issues.')
 except Exception as e:
-    print(f"An unexpected error occurred during package upgrades: {e}")
+    print(f'An unexpected error occurred during package upgrades: {e}')
 
 # --- Now, import from transformers ---
 try:
     from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
-    print("Successfully imported AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.")
+    print('Successfully imported AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.')
 except ImportError as e:
-    print(f"CRITICAL ERROR: Failed to import from transformers after attempting upgrades: {e}")
-    print("The application might not work correctly. Please check the environment and dependencies.")
+    print(f'CRITICAL ERROR: Failed to import from transformers after attempting upgrades: {e}')
+    print('The application might not work correctly. Please check the environment and dependencies.')
     # As a last resort, define dummy classes if import fails, so the rest of the script doesn't crash immediately
     class AutoModelForCausalLM: pass
     class AutoTokenizer: pass
     class AutoConfig: pass
 except Exception as e:
-    print(f"An unexpected error occurred during transformers import: {e}")
+    print(f'An unexpected error occurred during transformers import: {e}')
 
 # --- Configuration ---
-WHISPER_MODEL_SIZE = "small" # Using small model for faster processing
-SPEECH_ENCODER_PATH = "models/speech_encoder"
-LLAMA_OMNI2_MODEL_NAME = "LLaMA-Omni2-0.5B"
-LLAMA_OMNI2_HF_REPO = f"ICTNLP/{LLAMA_OMNI2_MODEL_NAME}"
-LLAMA_OMNI2_MODEL_PATH = f"models/{LLAMA_OMNI2_MODEL_NAME}"
-COSYVOICE_HF_REPO = "ICTNLP/cosy2_decoder"
-COSYVOICE_PATH = "models/cosy2_decoder"
+WHISPER_MODEL_SIZE = 'small' # Using smallest model for faster processing in testing
+SPEECH_ENCODER_PATH = 'models/speech_encoder'
+MODEL_NAME = 'LLaMA-Omni2-0.5B'
+MODEL_PATH = f'models/{MODEL_NAME}'
+HF_REPO = f'ICTNLP/{MODEL_NAME}'
 
 # --- Print diagnostics ---
-print("===== Application Startup =====")
-print("Python:", sys.version)
-print("Torch version:", torch.__version__)
-print(f"CUDA available: {torch.cuda.is_available()}")
+print('===== Application Startup =====')
+print('Python:', sys.version)
+print('Torch version:', torch.__version__)
+print(f'CUDA available: {torch.cuda.is_available()}')
 if torch.cuda.is_available():
-    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+    print(f'CUDA device: {torch.cuda.get_device_name(0)}')
+    print(f'CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB')
 
 # --- Main models ---
 whisper_model = None
@@ -61,115 +59,112 @@ llama_model = None
 tokenizer = None
 
 def load_whisper_model():
-    """Load Whisper model for speech recognition"""
+    '''Load Whisper model for speech recognition'''
     global whisper_model
-    print(f"Loading Whisper {WHISPER_MODEL_SIZE} model...")
+    print(f'Loading Whisper {WHISPER_MODEL_SIZE} model...')
 
     # Create directory if it doesn't exist
    os.makedirs(SPEECH_ENCODER_PATH, exist_ok=True)
 
     # Load the model (will download if not present)
     whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, download_root=SPEECH_ENCODER_PATH)
-    print(f"Whisper {WHISPER_MODEL_SIZE} model loaded successfully!")
+    print(f'Whisper {WHISPER_MODEL_SIZE} model loaded successfully!')
     return whisper_model
 
 def load_llama_model():
-    """Load LLaMA-Omni2 model"""
+    '''Load LLaMA-Omni2 model'''
     global llama_model, tokenizer
-    print(f"Attempting to load LLaMA-Omni2 model: {LLAMA_OMNI2_HF_REPO}")
+    print(f'Attempting to load LLaMA-Omni2 model: {HF_REPO}')
 
     # Ensure local model directory exists for downloads
-    os.makedirs(LLAMA_OMNI2_MODEL_PATH, exist_ok=True)
+    os.makedirs(MODEL_PATH, exist_ok=True)
 
     # Download model files if they aren't already present locally
     # Check for a common file like config.json to decide if download is needed
-    if not os.path.exists(os.path.join(LLAMA_OMNI2_MODEL_PATH, "config.json")):
-        print(f"Local model files not found. Downloading from Hugging Face Hub: {LLAMA_OMNI2_HF_REPO} to {LLAMA_OMNI2_MODEL_PATH}")
+    if not os.path.exists(os.path.join(MODEL_PATH, 'config.json')):
+        print(f'Local model files not found. Downloading from Hugging Face Hub: {HF_REPO} to {MODEL_PATH}')
         try:
             snapshot_download(
-                repo_id=LLAMA_OMNI2_HF_REPO,
-                local_dir=LLAMA_OMNI2_MODEL_PATH,
+                repo_id=HF_REPO,
+                local_dir=MODEL_PATH,
                 local_dir_use_symlinks=False,
                 resume_download=True,
-                # token=os.environ.get("HF_TOKEN") # Optional: use if your model is private
             )
-            print("Model download complete.")
+            print('Model download complete.')
         except Exception as e:
-            print(f"ERROR during model download: {e}")
-            # If download fails, we likely can't proceed with this model.
-            # The function will try to load from local path anyway, but it will likely fail.
-            pass # Allow to proceed to loading attempt, which will then fail more descriptively
+            print(f'ERROR during model download: {e}')
+            pass # Allow to proceed to loading attempt, which will then fail more descriptively
 
     try:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        torch_dtype = torch.float16 if device == "cuda" else torch.float32
-        print(f"Target device: {device}, dtype: {torch_dtype}")
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        torch_dtype = torch.float16 if device == 'cuda' else torch.float32
+        print(f'Target device: {device}, dtype: {torch_dtype}')
 
-        print(f"Attempt 1: Loading tokenizer and model directly from Hub identifier: {LLAMA_OMNI2_HF_REPO} with trust_remote_code=True")
+        print(f'Attempt 1: Loading tokenizer and model directly from Hub identifier: {HF_REPO} with trust_remote_code=True')
         try:
             tokenizer = AutoTokenizer.from_pretrained(
-                LLAMA_OMNI2_HF_REPO,
+                HF_REPO,
                 trust_remote_code=True
             )
-            print("Tokenizer loaded successfully from Hub identifier.")
+            print('Tokenizer loaded successfully from Hub identifier.')
 
             config = AutoConfig.from_pretrained(
-                LLAMA_OMNI2_HF_REPO,
+                HF_REPO,
                 trust_remote_code=True
             )
-            print("Config loaded successfully from Hub identifier.")
+            print('Config loaded successfully from Hub identifier.')
 
             llama_model = AutoModelForCausalLM.from_pretrained(
-                LLAMA_OMNI2_HF_REPO,
-                config=config, # Pass the loaded config
+                HF_REPO,
+                config=config, # Pass the loaded config
                 torch_dtype=torch_dtype,
-                device_map=device, # device_map handles moving parts of the model to CPU if OOM on GPU
+                device_map=device, # device_map handles moving parts of the model to CPU if OOM on GPU
                 trust_remote_code=True
             )
-            print(f"LLaMA-Omni2 model loaded successfully directly from Hub: {LLAMA_OMNI2_HF_REPO}")
+            print(f'LLaMA-Omni2 model loaded successfully directly from Hub: {HF_REPO}')
             return llama_model
         except Exception as e1:
-            print(f"Error in Attempt 1 (direct Hub load for {LLAMA_OMNI2_HF_REPO}): {e1}")
-            print("This often means the model requires a specific transformers version or has complex remote code.")
+            print(f'Error in Attempt 1 (direct Hub load for {HF_REPO}): {e1}')
+            print('This often means the model requires a specific transformers version or has complex remote code.')
 
-            print(f"Attempt 2: Loading tokenizer and model from local path: {LLAMA_OMNI2_MODEL_PATH} with trust_remote_code=True (fallback)")
+            print(f'Attempt 2: Loading tokenizer and model from local path: {MODEL_PATH} with trust_remote_code=True (fallback)')
             try:
                 tokenizer = AutoTokenizer.from_pretrained(
-                    LLAMA_OMNI2_MODEL_PATH, # Fallback to local path
+                    MODEL_PATH, # Fallback to local path
                     trust_remote_code=True
                 )
-                print("Tokenizer loaded successfully from local path.")
+                print('Tokenizer loaded successfully from local path.')
 
                 config = AutoConfig.from_pretrained(
-                    LLAMA_OMNI2_MODEL_PATH,
+                    MODEL_PATH,
                     trust_remote_code=True
                 )
-                print("Config loaded successfully from local path.")
+                print('Config loaded successfully from local path.')
 
                 llama_model = AutoModelForCausalLM.from_pretrained(
-                    LLAMA_OMNI2_MODEL_PATH, # Fallback to local path
+                    MODEL_PATH, # Fallback to local path
                     config=config,
                     torch_dtype=torch_dtype,
                     device_map=device,
                     trust_remote_code=True
                 )
-                print(f"LLaMA-Omni2 model loaded successfully from local path: {LLAMA_OMNI2_MODEL_PATH}")
+                print(f'LLaMA-Omni2 model loaded successfully from local path: {MODEL_PATH}')
                 return llama_model
             except Exception as e2:
-                print(f"Error in Attempt 2 (local path load for {LLAMA_OMNI2_MODEL_PATH}): {e2}")
+                print(f'Error in Attempt 2 (local path load for {MODEL_PATH}): {e2}')
 
-        print("All attempts to load the LLaMA-Omni2 model failed.")
-        raise RuntimeError("Failed to load LLaMA-Omni2 model after multiple attempts.")
+        print('All attempts to load the LLaMA-Omni2 model failed.')
+        raise RuntimeError('Failed to load LLaMA-Omni2 model after multiple attempts.')
 
     except Exception as e_outer:
-        print(f"CRITICAL ERROR loading LLaMA-Omni2 model: {e_outer}")
-        print("Falling back: Text generation will not be available.")
-        llama_model = None # Ensure llama_model is None if loading fails
-        tokenizer = None # Ensure tokenizer is None
+        print(f'CRITICAL ERROR loading LLaMA-Omni2 model: {e_outer}')
+        print('Falling back: Text generation will not be available.')
+        llama_model = None # Ensure llama_model is None if loading fails
+        tokenizer = None # Ensure tokenizer is None
         return None
 
 def transcribe_audio(audio_path):
-    """Transcribe audio using Whisper"""
+    '''Transcribe audio using Whisper'''
     global whisper_model
 
     if whisper_model is None:
@@ -177,12 +172,12 @@ def transcribe_audio(audio_path):
 
     try:
         result = whisper_model.transcribe(audio_path)
-        return result["text"]
+        return result['text']
     except Exception as e:
-        return f"Error transcribing audio: {e}"
+        return f'Error transcribing audio: {e}'
 
 def generate_text(input_text):
-    """Generate text using LLaMA-Omni2"""
+    '''Generate text using LLaMA-Omni2'''
     global llama_model, tokenizer
 
     if llama_model is None or tokenizer is None:
@@ -191,10 +186,10 @@ def generate_text(input_text):
     try:
         # If model loading failed, just return a placeholder response
         if llama_model is None:
-            return f"Model could not be loaded. Input was: {input_text}"
+            return f'Model could not be loaded. Input was: {input_text}'
 
         device = next(llama_model.parameters()).device
-        inputs = tokenizer(input_text, return_tensors="pt").to(device)
+        inputs = tokenizer(input_text, return_tensors='pt').to(device)
 
         outputs = llama_model.generate(
             inputs.input_ids,
@@ -206,10 +201,10 @@
 
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
-        return f"Error generating text: {e}"
+        return f'Error generating text: {e}'
 
 def speech_to_text_to_speech(audio_path):
-    """Pipeline: Speech -> Text -> Response"""
+    '''Pipeline: Speech -> Text -> Response'''
     # First transcribe the audio
     transcription = transcribe_audio(audio_path)
 
@@ -218,42 +213,38 @@ def speech_to_text_to_speech(audio_path):
 
     return transcription, response
 
-# --- Gradio Interface ---
+# --- Gradio Interface for Hugging Face Spaces ---
 def create_demo():
-    with gr.Blocks(title="LLaMA-Omni2 Interface") as demo:
-        gr.Markdown("# LLaMA-Omni2 Demo")
+    with gr.Blocks(title='LLaMA-Omni2 Demo on Hugging Face Spaces') as demo:
+        gr.Markdown('# LLaMA-Omni2 Demo')
+        gr.Markdown('This demo uses the smallest Whisper model and LLaMA-Omni2-0.5B for testing purposes.')
 
-        with gr.Tab("Text Generation"):
+        with gr.Tab('Text Generation'):
             with gr.Row():
-                text_input = gr.Textbox(label="Input Text", placeholder="Enter text here...")
-                text_output = gr.Textbox(label="Generated Response")
+                text_input = gr.Textbox(label='Input Text', placeholder='Enter text here...')
+                text_output = gr.Textbox(label='Generated Response')
 
-            text_button = gr.Button("Generate Response")
+            text_button = gr.Button('Generate Response')
             text_button.click(generate_text, inputs=text_input, outputs=text_output)
 
-        with gr.Tab("Speech-to-Text"):
-            audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
-            transcription_output = gr.Textbox(label="Transcription")
-            response_output = gr.Textbox(label="Generated Response")
+        with gr.Tab('Speech-to-Text'):
+            audio_input = gr.Audio(type='filepath', label='Upload or Record Audio')
+            transcription_output = gr.Textbox(label='Transcription')
+            response_output = gr.Textbox(label='Generated Response')
 
-            transcribe_button = gr.Button("Transcribe and Respond")
+            transcribe_button = gr.Button('Transcribe and Respond')
             transcribe_button.click(speech_to_text_to_speech,
                                     inputs=audio_input,
                                     outputs=[transcription_output, response_output])
 
-        gr.Markdown("### Note: The first run will download models if needed, which may take some time.")
+        gr.Markdown('### Note: The first run will download models if needed, which may take some time.')
 
     return demo
 
 # --- Main entry point ---
-if __name__ == "__main__":
-    print("Starting LLaMA-Omni2 Interface...")
-
-    # Pre-load models (comment out if you want lazy loading)
-    # print("Pre-loading models...")
-    # load_whisper_model()
-    # load_llama_model()
+if __name__ == '__main__':
+    print('Starting LLaMA-Omni2 Interface for Hugging Face Spaces...')
 
     # Create and launch the Gradio interface
     demo = create_demo()
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+    demo.launch(server_name='0.0.0.0', server_port=7860, share=True) # share=True for Hugging Face Spaces
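A minimal way to exercise the refactored loading and generation path outside the Gradio UI is sketched below. It is not part of this commit: the file name smoke_test.py is hypothetical, and it only assumes app.py sits in the working directory and keeps the function signatures shown in the diff above.

# smoke_test.py -- hypothetical smoke test, not included in this commit.
# Importing app runs the module-level pip upgrades and startup diagnostics defined above.
from app import load_whisper_model, load_llama_model, generate_text, transcribe_audio

load_whisper_model()   # fetches the Whisper 'small' checkpoint into models/speech_encoder on first use
load_llama_model()     # fetches ICTNLP/LLaMA-Omni2-0.5B into models/LLaMA-Omni2-0.5B, or returns None on failure

# generate_text() falls back to a placeholder string when the LLaMA-Omni2 model is unavailable.
print(generate_text("Hello from the smoke test"))

# transcribe_audio() expects a filesystem path, matching gr.Audio(type='filepath') in the UI.
# print(transcribe_audio("sample.wav"))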