AC2513 committed on
Commit
80d03f7
·
1 Parent(s): ceb2ea0

added audio processing

Browse files
Files changed (3) hide show
  1. app.py +47 -6
  2. requirements.txt +2 -1
  3. utils.py +62 -6
app.py CHANGED
@@ -68,6 +68,16 @@ def run(
68
  f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
69
  )
70
 
 
 
 
 
 
 
 
 
 
 
71
  def try_fallback_model(original_model_choice: str):
72
  fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
73
  fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
@@ -235,13 +245,26 @@ def run(
235
  yield error_message
236
 
237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  demo = gr.ChatInterface(
239
  fn=run,
240
  type="messages",
241
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
242
- textbox=gr.MultimodalTextbox(
243
- file_types=[".mp4", ".jpg", ".png", ".pdf"], file_count="multiple", autofocus=True
244
- ),
245
  multimodal=True,
246
  additional_inputs=[
247
  gr.Dropdown(
@@ -268,7 +291,7 @@ demo = gr.ChatInterface(
268
  label="Model",
269
  choices=["Gemma 3 12B", "Gemma 3n E4B"],
270
  value="Gemma 3 12B",
271
- info="Gemma 3 12B: More powerful and detailed responses, but slower processing. Gemma 3n E4B: Faster processing with efficient performance for most tasks."
272
  ),
273
  gr.Slider(
274
  label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
@@ -293,11 +316,29 @@ demo = gr.ChatInterface(
293
  # Connect the dropdown to update the textbox
294
  with demo:
295
  preset_dropdown = demo.additional_inputs[0]
296
- custom_textbox = demo.additional_inputs[1]
 
 
 
297
  preset_dropdown.change(
298
  fn=update_custom_prompt,
299
  inputs=[preset_dropdown],
300
- outputs=[custom_textbox]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  )
302
 
303
  if __name__ == "__main__":
 
68
  f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
69
  )
70
 
71
+ # Validate audio files are only used with 3n model
72
+ if message.get("files"):
73
+ audio_extensions = [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
74
+ has_audio = any(any(file.lower().endswith(ext) for ext in audio_extensions) for file in message["files"])
75
+
76
+ if has_audio and model_choice != "Gemma 3n E4B":
77
+ error_msg = "❌ **Audio files are only supported with the Gemma 3n E4B model.**\n\nPlease switch to the Gemma 3n E4B model to process audio files, or remove audio files to continue with the current model."
78
+ yield error_msg
79
+ return
80
+
81
  def try_fallback_model(original_model_choice: str):
82
  fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
83
  fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
 
245
  yield error_message
246
 
247
 
248
+ def update_file_types(model_choice):
249
+ """Update allowed file types based on model selection."""
250
+ base_types = [".mp4", ".jpg", ".png", ".pdf"]
251
+ if model_choice == "Gemma 3n E4B":
252
+ # Add audio file types for 3n model
253
+ return base_types + [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
254
+ return base_types
255
+
256
+ # Create a custom textbox that we can update
257
+ custom_textbox = gr.MultimodalTextbox(
258
+ file_types=[".mp4", ".jpg", ".png", ".pdf"],
259
+ file_count="multiple",
260
+ autofocus=True
261
+ )
262
+
263
  demo = gr.ChatInterface(
264
  fn=run,
265
  type="messages",
266
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
267
+ textbox=custom_textbox,
 
 
268
  multimodal=True,
269
  additional_inputs=[
270
  gr.Dropdown(
 
291
  label="Model",
292
  choices=["Gemma 3 12B", "Gemma 3n E4B"],
293
  value="Gemma 3 12B",
294
+ info="Gemma 3 12B: More powerful and detailed responses, supports images, videos, and PDFs. Gemma 3n E4B: Faster processing with efficient performance, supports images, videos, PDFs, and audio files."
295
  ),
296
  gr.Slider(
297
  label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
 
316
  # Connect the dropdown to update the textbox
317
  with demo:
318
  preset_dropdown = demo.additional_inputs[0]
319
+ custom_textbox_input = demo.additional_inputs[1]
320
+ model_dropdown = demo.additional_inputs[2]
321
+
322
+ # Update custom prompt when preset changes
323
  preset_dropdown.change(
324
  fn=update_custom_prompt,
325
  inputs=[preset_dropdown],
326
+ outputs=[custom_textbox_input]
327
+ )
328
+
329
+ # Update file types when model changes
330
+ def update_textbox_file_types(model_choice):
331
+ allowed_types = update_file_types(model_choice)
332
+ return gr.MultimodalTextbox(
333
+ file_types=allowed_types,
334
+ file_count="multiple",
335
+ autofocus=True
336
+ )
337
+
338
+ model_dropdown.change(
339
+ fn=update_textbox_file_types,
340
+ inputs=[model_dropdown],
341
+ outputs=[demo.textbox]
342
  )
343
 
344
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -9,4 +9,5 @@ loguru
9
  python-dotenv
10
  opencv-python
11
  timm
12
- pymupdf
 
 
9
  python-dotenv
10
  opencv-python
11
  timm
12
+ pymupdf
13
+ librosa
utils.py CHANGED
@@ -2,12 +2,15 @@ import os
2
  import cv2
3
  import fitz
4
  import tempfile
 
 
5
  from PIL import Image
6
  from loguru import logger
7
 
8
  # Constants
9
  MAX_VIDEO_SIZE = 100 * 1024 * 1024 # 100 MB
10
  MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MB
 
11
 
12
 
13
  def check_file_size(file_path: str) -> bool:
@@ -16,13 +19,17 @@ def check_file_size(file_path: str) -> bool:
16
  raise ValueError(f"File not found: {file_path}")
17
 
18
  file_size = os.path.getsize(file_path)
 
19
 
20
- if file_path.lower().endswith((".mp4", ".mov")):
21
  if file_size > MAX_VIDEO_SIZE:
22
  raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
 
 
 
23
  else:
24
  if file_size > MAX_IMAGE_SIZE:
25
- raise ValueError(f"Image file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
26
 
27
  return True
28
 
@@ -74,6 +81,44 @@ def process_video(video_path: str, max_images: int) -> list[dict]:
74
  return result_content
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def extract_pdf_text(pdf_path: str) -> str:
78
  """Extract text content from a PDF file."""
79
  check_file_size(pdf_path)
@@ -114,14 +159,22 @@ def process_user_input(message: dict, max_images: int) -> list[dict]:
114
  logger.error(f"File size check failed: {e}")
115
  result_content.append({"type": "text", "text": f"Error: {str(e)}"})
116
  continue
 
 
117
 
118
- if file_path.endswith((".mp4", ".mov")):
119
  try:
120
  result_content = [*result_content, *process_video(file_path, max_images)]
121
  except Exception as e:
122
  logger.error(f"Video processing failed: {e}")
123
  result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
124
- elif file_path.lower().endswith(".pdf"):
 
 
 
 
 
 
125
  try:
126
  logger.info(f"Processing PDF file: {file_path}")
127
  pdf_text = extract_pdf_text(file_path)
@@ -162,9 +215,12 @@ def process_history(history: list[dict]) -> list[dict]:
162
  content_buffer.append({"type": "text", "text": content})
163
  elif isinstance(content, tuple) and len(content) > 0:
164
  file_path = content[0]
165
- if file_path.endswith((".mp4", ".mov")):
 
166
  content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
167
- elif file_path.lower().endswith(".pdf"):
 
 
168
  content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
169
  else:
170
  content_buffer.append({"type": "image", "url": file_path})
 
2
  import cv2
3
  import fitz
4
  import tempfile
5
+ import librosa
6
+ import numpy as np
7
  from PIL import Image
8
  from loguru import logger
9
 
10
  # Constants
11
  MAX_VIDEO_SIZE = 100 * 1024 * 1024 # 100 MB
12
  MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MB
13
+ MAX_AUDIO_SIZE = 50 * 1024 * 1024 # 50 MB
14
 
15
 
16
  def check_file_size(file_path: str) -> bool:
 
19
  raise ValueError(f"File not found: {file_path}")
20
 
21
  file_size = os.path.getsize(file_path)
22
+ file_lower = file_path.lower()
23
 
24
+ if file_lower.endswith((".mp4", ".mov")):
25
  if file_size > MAX_VIDEO_SIZE:
26
  raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
27
+ elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
28
+ if file_size > MAX_AUDIO_SIZE:
29
+ raise ValueError(f"Audio file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_AUDIO_SIZE / (1024*1024):.0f}MB")
30
  else:
31
  if file_size > MAX_IMAGE_SIZE:
32
+ raise ValueError(f"Image/document file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
33
 
34
  return True
35
 
 
81
  return result_content
82
 
83
 
84
+ def process_audio(audio_path: str) -> list[dict]:
85
+ """Process an audio file and return formatted content for the model."""
86
+ check_file_size(audio_path)
87
+
88
+ try:
89
+ # Load audio file
90
+ audio_data, sample_rate = librosa.load(audio_path, sr=None)
91
+ duration = len(audio_data) / sample_rate
92
+
93
+ # Get basic audio features
94
+ rms = librosa.feature.rms(y=audio_data)[0]
95
+ spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)[0]
96
+ zero_crossings = librosa.zero_crossings(audio_data, pad=False)
97
+
98
+ # Calculate statistics
99
+ avg_rms = np.mean(rms)
100
+ avg_spectral_centroid = np.mean(spectral_centroids)
101
+ zcr_rate = np.sum(zero_crossings) / len(audio_data)
102
+
103
+ # Create audio analysis text
104
+ audio_analysis = f"""Audio Analysis:
105
+ - Duration: {duration:.2f} seconds
106
+ - Sample Rate: {sample_rate} Hz
107
+ - Average RMS Energy: {avg_rms:.4f}
108
+ - Average Spectral Centroid: {avg_spectral_centroid:.2f} Hz
109
+ - Zero Crossing Rate: {zcr_rate:.4f}
110
+ - File: {os.path.basename(audio_path)}"""
111
+
112
+ result_content = [{"type": "text", "text": audio_analysis}]
113
+
114
+ logger.debug(f"Processed audio file {audio_path} - Duration: {duration:.2f}s")
115
+ return result_content
116
+
117
+ except Exception as e:
118
+ logger.error(f"Error processing audio {audio_path}: {e}")
119
+ raise ValueError(f"Failed to process audio file: {str(e)}")
120
+
121
+
122
  def extract_pdf_text(pdf_path: str) -> str:
123
  """Extract text content from a PDF file."""
124
  check_file_size(pdf_path)
 
159
  logger.error(f"File size check failed: {e}")
160
  result_content.append({"type": "text", "text": f"Error: {str(e)}"})
161
  continue
162
+
163
+ file_lower = file_path.lower()
164
 
165
+ if file_lower.endswith((".mp4", ".mov")):
166
  try:
167
  result_content = [*result_content, *process_video(file_path, max_images)]
168
  except Exception as e:
169
  logger.error(f"Video processing failed: {e}")
170
  result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
171
+ elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
172
+ try:
173
+ result_content = [*result_content, *process_audio(file_path)]
174
+ except Exception as e:
175
+ logger.error(f"Audio processing failed: {e}")
176
+ result_content.append({"type": "text", "text": f"Error processing audio: {str(e)}"})
177
+ elif file_lower.endswith(".pdf"):
178
  try:
179
  logger.info(f"Processing PDF file: {file_path}")
180
  pdf_text = extract_pdf_text(file_path)
 
215
  content_buffer.append({"type": "text", "text": content})
216
  elif isinstance(content, tuple) and len(content) > 0:
217
  file_path = content[0]
218
+ file_lower = file_path.lower()
219
+ if file_lower.endswith((".mp4", ".mov")):
220
  content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
221
+ elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
222
+ content_buffer.append({"type": "text", "text": "[Audio uploaded previously]"})
223
+ elif file_lower.endswith(".pdf"):
224
  content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
225
  else:
226
  content_buffer.append({"type": "image", "url": file_path})