AndyC committed on
Commit
5f4b02a
·
1 Parent(s): 3a8e58d

added pdf file reading

Browse files
Files changed (2) hide show
  1. app.py +34 -2
  2. requirements.txt +2 -1
app.py CHANGED
@@ -16,6 +16,7 @@ from dotenv import load_dotenv, find_dotenv
16
  import cv2
17
  from loguru import logger
18
  from PIL import Image
 
19
 
20
  dotenv_path = find_dotenv()
21
 
@@ -61,7 +62,6 @@ def check_file_size(file_path: str) -> bool:
61
 
62
 
63
  def get_frames(video_path: str, max_images: int) -> list[tuple[Image.Image, float]]:
64
- # Check file size before processing
65
  check_file_size(video_path)
66
 
67
  frames: list[tuple[Image.Image, float]] = []
@@ -106,6 +106,31 @@ def process_video(video_path: str, max_images: int) -> list[dict]:
106
  return result_content
107
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  def process_user_input(message: dict, max_images: int) -> list[dict]:
110
  if not message["files"]:
111
  return [{"type": "text", "text": message["text"]}]
@@ -126,6 +151,13 @@ def process_user_input(message: dict, max_images: int) -> list[dict]:
126
  except Exception as e:
127
  logger.error(f"Video processing failed: {e}")
128
  result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
 
 
 
 
 
 
 
129
  else:
130
  result_content = [*result_content, {"type": "image", "url": file_path}]
131
 
@@ -230,7 +262,7 @@ demo = gr.ChatInterface(
230
  type="messages",
231
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
232
  textbox=gr.MultimodalTextbox(
233
- file_types=[".mp4", ".jpg", ".png"], file_count="multiple", autofocus=True
234
  ),
235
  multimodal=True,
236
  additional_inputs=[
 
16
  import cv2
17
  from loguru import logger
18
  from PIL import Image
19
+ import fitz
20
 
21
  dotenv_path = find_dotenv()
22
 
 
62
 
63
 
64
  def get_frames(video_path: str, max_images: int) -> list[tuple[Image.Image, float]]:
 
65
  check_file_size(video_path)
66
 
67
  frames: list[tuple[Image.Image, float]] = []
 
106
  return result_content
107
 
108
 
109
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every non-empty page of a PDF.

    Each page that contains text is rendered as ``Page N:`` followed by the
    page text; the rendered pages are joined with blank lines.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The concatenated page text, or the message
        ``"No text content found in the PDF."`` when no page yields any text.

    Raises:
        ValueError: If the PDF cannot be opened or its pages cannot be read.
            The original exception is chained as the cause.
    """
    # Kept outside the try block so its outcome is not rewrapped as a PDF
    # extraction error. NOTE(review): presumably raises for oversized files,
    # as in get_frames — confirm against check_file_size.
    check_file_size(pdf_path)

    try:
        # The context manager closes the document even if a page fails to load.
        with fitz.open(pdf_path) as doc:
            text_content = []
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():  # Only add non-empty pages
                    text_content.append(f"Page {page_num}:\n{text}")
    except Exception as e:
        logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
        # Raise (not return) so callers wrapping this call in try/except see
        # the failure instead of receiving an exception object as "text".
        raise ValueError(f"Failed to extract text from PDF: {str(e)}") from e

    if not text_content:
        return "No text content found in the PDF."

    return "\n\n".join(text_content)
132
+
133
+
134
  def process_user_input(message: dict, max_images: int) -> list[dict]:
135
  if not message["files"]:
136
  return [{"type": "text", "text": message["text"]}]
 
151
  except Exception as e:
152
  logger.error(f"Video processing failed: {e}")
153
  result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
154
+ elif file_path.lower().endswith(".pdf"):
155
+ try:
156
+ pdf_text = extract_pdf_text(file_path)
157
+ result_content.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
158
+ except Exception as e:
159
+ logger.error(f"PDF processing failed: {e}")
160
+ result_content.append({"type": "text", "text": f"Error processing PDF: {str(e)}"})
161
  else:
162
  result_content = [*result_content, {"type": "image", "url": file_path}]
163
 
 
262
  type="messages",
263
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
264
  textbox=gr.MultimodalTextbox(
265
+ file_types=[".mp4", ".jpg", ".png", ".pdf"], file_count="multiple", autofocus=True
266
  ),
267
  multimodal=True,
268
  additional_inputs=[
requirements.txt CHANGED
@@ -7,4 +7,5 @@ pytest
7
  loguru
8
  python-dotenv
9
  opencv-python
10
- timm
 
 
7
  loguru
8
  python-dotenv
9
  opencv-python
10
+ timm
11
+ pymupdf