AC2513 committed on
Commit
1a184e0
·
1 Parent(s): ce78f65

Revert "added audio processing"

Browse files

This reverts commit 80d03f778e1e2fd2b3d0126abf41a75857409f18.

Files changed (3) hide show
  1. app.py +6 -47
  2. requirements.txt +1 -2
  3. utils.py +6 -62
app.py CHANGED
@@ -68,16 +68,6 @@ def run(
68
  f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
69
  )
70
 
71
- # Validate audio files are only used with 3n model
72
- if message.get("files"):
73
- audio_extensions = [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
74
- has_audio = any(any(file.lower().endswith(ext) for ext in audio_extensions) for file in message["files"])
75
-
76
- if has_audio and model_choice != "Gemma 3n E4B":
77
- error_msg = "❌ **Audio files are only supported with the Gemma 3n E4B model.**\n\nPlease switch to the Gemma 3n E4B model to process audio files, or remove audio files to continue with the current model."
78
- yield error_msg
79
- return
80
-
81
  def try_fallback_model(original_model_choice: str):
82
  fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
83
  fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
@@ -245,26 +235,13 @@ def run(
245
  yield error_message
246
 
247
 
248
- def update_file_types(model_choice):
249
- """Update allowed file types based on model selection."""
250
- base_types = [".mp4", ".jpg", ".png", ".pdf"]
251
- if model_choice == "Gemma 3n E4B":
252
- # Add audio file types for 3n model
253
- return base_types + [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
254
- return base_types
255
-
256
- # Create a custom textbox that we can update
257
- custom_textbox = gr.MultimodalTextbox(
258
- file_types=[".mp4", ".jpg", ".png", ".pdf"],
259
- file_count="multiple",
260
- autofocus=True
261
- )
262
-
263
  demo = gr.ChatInterface(
264
  fn=run,
265
  type="messages",
266
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
267
- textbox=custom_textbox,
 
 
268
  multimodal=True,
269
  additional_inputs=[
270
  gr.Dropdown(
@@ -291,7 +268,7 @@ demo = gr.ChatInterface(
291
  label="Model",
292
  choices=["Gemma 3 12B", "Gemma 3n E4B"],
293
  value="Gemma 3 12B",
294
- info="Gemma 3 12B: More powerful and detailed responses, supports images, videos, and PDFs. Gemma 3n E4B: Faster processing with efficient performance, supports images, videos, PDFs, and audio files."
295
  ),
296
  gr.Slider(
297
  label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
@@ -316,29 +293,11 @@ demo = gr.ChatInterface(
316
  # Connect the dropdown to update the textbox
317
  with demo:
318
  preset_dropdown = demo.additional_inputs[0]
319
- custom_textbox_input = demo.additional_inputs[1]
320
- model_dropdown = demo.additional_inputs[2]
321
-
322
- # Update custom prompt when preset changes
323
  preset_dropdown.change(
324
  fn=update_custom_prompt,
325
  inputs=[preset_dropdown],
326
- outputs=[custom_textbox_input]
327
- )
328
-
329
- # Update file types when model changes
330
- def update_textbox_file_types(model_choice):
331
- allowed_types = update_file_types(model_choice)
332
- return gr.MultimodalTextbox(
333
- file_types=allowed_types,
334
- file_count="multiple",
335
- autofocus=True
336
- )
337
-
338
- model_dropdown.change(
339
- fn=update_textbox_file_types,
340
- inputs=[model_dropdown],
341
- outputs=[demo.textbox]
342
  )
343
 
344
  if __name__ == "__main__":
 
68
  f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
69
  )
70
 
 
 
 
 
 
 
 
 
 
 
71
  def try_fallback_model(original_model_choice: str):
72
  fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
73
  fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
 
235
  yield error_message
236
 
237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  demo = gr.ChatInterface(
239
  fn=run,
240
  type="messages",
241
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
242
+ textbox=gr.MultimodalTextbox(
243
+ file_types=[".mp4", ".jpg", ".png", ".pdf"], file_count="multiple", autofocus=True
244
+ ),
245
  multimodal=True,
246
  additional_inputs=[
247
  gr.Dropdown(
 
268
  label="Model",
269
  choices=["Gemma 3 12B", "Gemma 3n E4B"],
270
  value="Gemma 3 12B",
271
+ info="Gemma 3 12B: More powerful and detailed responses, but slower processing. Gemma 3n E4B: Faster processing with efficient performance for most tasks."
272
  ),
273
  gr.Slider(
274
  label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
 
293
  # Connect the dropdown to update the textbox
294
  with demo:
295
  preset_dropdown = demo.additional_inputs[0]
296
+ custom_textbox = demo.additional_inputs[1]
 
 
 
297
  preset_dropdown.change(
298
  fn=update_custom_prompt,
299
  inputs=[preset_dropdown],
300
+ outputs=[custom_textbox]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  )
302
 
303
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -9,5 +9,4 @@ loguru
9
  python-dotenv
10
  opencv-python
11
  timm
12
- pymupdf
13
- librosa
 
9
  python-dotenv
10
  opencv-python
11
  timm
12
+ pymupdf
 
utils.py CHANGED
@@ -2,15 +2,12 @@ import os
2
  import cv2
3
  import fitz
4
  import tempfile
5
- import librosa
6
- import numpy as np
7
  from PIL import Image
8
  from loguru import logger
9
 
10
  # Constants
11
  MAX_VIDEO_SIZE = 100 * 1024 * 1024 # 100 MB
12
  MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MB
13
- MAX_AUDIO_SIZE = 50 * 1024 * 1024 # 50 MB
14
 
15
  PRESET_PROMPTS = {
16
  "General Assistant": "You are a helpful AI assistant capable of analyzing images, videos, and PDF documents. Provide clear, accurate, and helpful responses to user queries.",
@@ -32,17 +29,13 @@ def check_file_size(file_path: str) -> bool:
32
  raise ValueError(f"File not found: {file_path}")
33
 
34
  file_size = os.path.getsize(file_path)
35
- file_lower = file_path.lower()
36
 
37
- if file_lower.endswith((".mp4", ".mov")):
38
  if file_size > MAX_VIDEO_SIZE:
39
  raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
40
- elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
41
- if file_size > MAX_AUDIO_SIZE:
42
- raise ValueError(f"Audio file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_AUDIO_SIZE / (1024*1024):.0f}MB")
43
  else:
44
  if file_size > MAX_IMAGE_SIZE:
45
- raise ValueError(f"Image/document file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
46
 
47
  return True
48
 
@@ -94,44 +87,6 @@ def process_video(video_path: str, max_images: int) -> list[dict]:
94
  return result_content
95
 
96
 
97
- def process_audio(audio_path: str) -> list[dict]:
98
- """Process an audio file and return formatted content for the model."""
99
- check_file_size(audio_path)
100
-
101
- try:
102
- # Load audio file
103
- audio_data, sample_rate = librosa.load(audio_path, sr=None)
104
- duration = len(audio_data) / sample_rate
105
-
106
- # Get basic audio features
107
- rms = librosa.feature.rms(y=audio_data)[0]
108
- spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)[0]
109
- zero_crossings = librosa.zero_crossings(audio_data, pad=False)
110
-
111
- # Calculate statistics
112
- avg_rms = np.mean(rms)
113
- avg_spectral_centroid = np.mean(spectral_centroids)
114
- zcr_rate = np.sum(zero_crossings) / len(audio_data)
115
-
116
- # Create audio analysis text
117
- audio_analysis = f"""Audio Analysis:
118
- - Duration: {duration:.2f} seconds
119
- - Sample Rate: {sample_rate} Hz
120
- - Average RMS Energy: {avg_rms:.4f}
121
- - Average Spectral Centroid: {avg_spectral_centroid:.2f} Hz
122
- - Zero Crossing Rate: {zcr_rate:.4f}
123
- - File: {os.path.basename(audio_path)}"""
124
-
125
- result_content = [{"type": "text", "text": audio_analysis}]
126
-
127
- logger.debug(f"Processed audio file {audio_path} - Duration: {duration:.2f}s")
128
- return result_content
129
-
130
- except Exception as e:
131
- logger.error(f"Error processing audio {audio_path}: {e}")
132
- raise ValueError(f"Failed to process audio file: {str(e)}")
133
-
134
-
135
  def extract_pdf_text(pdf_path: str) -> str:
136
  """Extract text content from a PDF file."""
137
  check_file_size(pdf_path)
@@ -172,22 +127,14 @@ def process_user_input(message: dict, max_images: int) -> list[dict]:
172
  logger.error(f"File size check failed: {e}")
173
  result_content.append({"type": "text", "text": f"Error: {str(e)}"})
174
  continue
175
-
176
- file_lower = file_path.lower()
177
 
178
- if file_lower.endswith((".mp4", ".mov")):
179
  try:
180
  result_content = [*result_content, *process_video(file_path, max_images)]
181
  except Exception as e:
182
  logger.error(f"Video processing failed: {e}")
183
  result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
184
- elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
185
- try:
186
- result_content = [*result_content, *process_audio(file_path)]
187
- except Exception as e:
188
- logger.error(f"Audio processing failed: {e}")
189
- result_content.append({"type": "text", "text": f"Error processing audio: {str(e)}"})
190
- elif file_lower.endswith(".pdf"):
191
  try:
192
  logger.info(f"Processing PDF file: {file_path}")
193
  pdf_text = extract_pdf_text(file_path)
@@ -228,12 +175,9 @@ def process_history(history: list[dict]) -> list[dict]:
228
  content_buffer.append({"type": "text", "text": content})
229
  elif isinstance(content, tuple) and len(content) > 0:
230
  file_path = content[0]
231
- file_lower = file_path.lower()
232
- if file_lower.endswith((".mp4", ".mov")):
233
  content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
234
- elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
235
- content_buffer.append({"type": "text", "text": "[Audio uploaded previously]"})
236
- elif file_lower.endswith(".pdf"):
237
  content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
238
  else:
239
  content_buffer.append({"type": "image", "url": file_path})
 
2
  import cv2
3
  import fitz
4
  import tempfile
 
 
5
  from PIL import Image
6
  from loguru import logger
7
 
8
  # Constants
9
  MAX_VIDEO_SIZE = 100 * 1024 * 1024 # 100 MB
10
  MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MB
 
11
 
12
  PRESET_PROMPTS = {
13
  "General Assistant": "You are a helpful AI assistant capable of analyzing images, videos, and PDF documents. Provide clear, accurate, and helpful responses to user queries.",
 
29
  raise ValueError(f"File not found: {file_path}")
30
 
31
  file_size = os.path.getsize(file_path)
 
32
 
33
+ if file_path.lower().endswith((".mp4", ".mov")):
34
  if file_size > MAX_VIDEO_SIZE:
35
  raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
 
 
 
36
  else:
37
  if file_size > MAX_IMAGE_SIZE:
38
+ raise ValueError(f"Image file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
39
 
40
  return True
41
 
 
87
  return result_content
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def extract_pdf_text(pdf_path: str) -> str:
91
  """Extract text content from a PDF file."""
92
  check_file_size(pdf_path)
 
127
  logger.error(f"File size check failed: {e}")
128
  result_content.append({"type": "text", "text": f"Error: {str(e)}"})
129
  continue
 
 
130
 
131
+ if file_path.endswith((".mp4", ".mov")):
132
  try:
133
  result_content = [*result_content, *process_video(file_path, max_images)]
134
  except Exception as e:
135
  logger.error(f"Video processing failed: {e}")
136
  result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
137
+ elif file_path.lower().endswith(".pdf"):
 
 
 
 
 
 
138
  try:
139
  logger.info(f"Processing PDF file: {file_path}")
140
  pdf_text = extract_pdf_text(file_path)
 
175
  content_buffer.append({"type": "text", "text": content})
176
  elif isinstance(content, tuple) and len(content) > 0:
177
  file_path = content[0]
178
+ if file_path.endswith((".mp4", ".mov")):
 
179
  content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
180
+ elif file_path.lower().endswith(".pdf"):
 
 
181
  content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
182
  else:
183
  content_buffer.append({"type": "image", "url": file_path})