Spaces:
Sleeping
Sleeping
feat: Refactor PDF Text Extractor application structure
Browse files
- Introduced a modular architecture by separating the application into distinct modules: app.py, ui, and utils.
- Implemented a main function in app.py to handle application launch and configuration.
- Added environment variable loading and API key validation.
- Created a .env.example file for environment variable setup guidance.
- Enhanced the UI components and handlers for better user interaction.
- Developed a comprehensive PDF text extraction utility using Mistral AI.
- Added tests for OCR functionality and setup validation.
- Updated .gitignore to exclude environment files and unnecessary artifacts.
- .env.example +5 -0
- .gitignore +47 -0
- app.py +33 -5
- main.py +15 -0
- pdf_text_extractor.py +254 -0
- requirements.txt +0 -0
- tests/test_ocr_direct.py +234 -0
- tests/test_setup.py +62 -0
- ui/__init__.py +15 -0
- ui/components.py +125 -0
- ui/handlers.py +104 -0
- ui/interface.py +177 -0
- utils/__init__.py +4 -0
- utils/config.py +40 -0
- utils/pdf_image_extractor.py +155 -0
.env.example
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Environment variables for PDF Explainer
|
2 |
+
# Copy this file to .env and fill in your actual API key
|
3 |
+
|
4 |
+
# Mistral AI API Key - Get yours from https://console.mistral.ai/
|
5 |
+
MISTRAL_API_KEY=your_mistral_api_key_here
|
.gitignore
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Environment variables
|
2 |
+
**/.env
|
3 |
+
|
4 |
+
# Python cache
|
5 |
+
**/__pycache__/
|
6 |
+
*.py[cod]
|
7 |
+
*$py.class
|
8 |
+
|
9 |
+
# Virtual environment
|
10 |
+
**/.venv
|
11 |
+
.venv/
|
12 |
+
venv/
|
13 |
+
env/
|
14 |
+
|
15 |
+
# IDE files
|
16 |
+
.vscode/settings.json
|
17 |
+
.idea/
|
18 |
+
|
19 |
+
# OS files
|
20 |
+
.DS_Store
|
21 |
+
Thumbs.db
|
22 |
+
|
23 |
+
# Gradio temporary files
|
24 |
+
gradio_cached_examples/
|
25 |
+
flagged/
|
26 |
+
|
27 |
+
# Log files
|
28 |
+
*.log
|
29 |
+
|
30 |
+
# Distribution / packaging
|
31 |
+
.Python
|
32 |
+
build/
|
33 |
+
develop-eggs/
|
34 |
+
dist/
|
35 |
+
downloads/
|
36 |
+
eggs/
|
37 |
+
.eggs/
|
38 |
+
lib/
|
39 |
+
lib64/
|
40 |
+
parts/
|
41 |
+
sdist/
|
42 |
+
var/
|
43 |
+
wheels/
|
44 |
+
*.egg-info/
|
45 |
+
.installed.cfg
|
46 |
+
*.egg
|
47 |
+
MANIFEST
|
app.py
CHANGED
@@ -1,7 +1,35 @@
|
|
1 |
-
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
PDF Text Extractor Application
|
3 |
+
Main entry point for the PDF Text Extractor application.
|
4 |
+
"""
|
5 |
|
6 |
+
import os
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from ui import create_interface
|
9 |
+
from utils.config import check_api_key, get_app_config
|
10 |
|
11 |
+
def main():
    """Configure and start the PDF Text Extractor web interface.

    Steps, in order: load variables from a ``.env`` file, run the API key
    check, build the interface, read the application configuration, and
    launch the interface with the configured server settings.
    """
    # Environment first, so the key check below can see values from .env.
    load_dotenv()
    check_api_key()

    # Build the UI before reading the launch settings (same order as before).
    interface = create_interface()
    app_config = get_app_config()

    # Forward only the launch-related settings to Gradio.
    launch_keys = ("server_port", "debug", "quiet", "max_file_size")
    launch_options = {key: app_config[key] for key in launch_keys}
    interface.launch(**launch_options)


if __name__ == "__main__":
    main()
|
main.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
PDF Text Extractor using Gradio and Mistral AI
|
3 |
+
A web application for extracting text from PDF files using Mistral's OCR capabilities.
|
4 |
+
|
5 |
+
This is a legacy entry point that maintains compatibility with the original app.
|
6 |
+
For a more modular structure, see app.py and the ui/ and utils/ folders.
|
7 |
+
"""
|
8 |
+
|
9 |
+
# Import from the new modular structure
|
10 |
+
from app import main
|
11 |
+
|
12 |
+
|
13 |
+
# Start the application only when this file is run directly; importing it
# has no side effect beyond importing ``app.main``.
if __name__ == "__main__":
    main()
|
pdf_text_extractor.py
ADDED
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import os
|
3 |
+
from typing import Optional, Tuple, List, Dict, Any
|
4 |
+
from mistralai import Mistral
|
5 |
+
|
6 |
+
class PDFTextExtractor:
    """Extract text and embedded images from PDF files using Mistral AI OCR.

    The PDF is sent to the ``mistral-ocr-latest`` model as a base64 data URL.
    The response is then probed with several strategies: per-page markdown
    first, then a chain of generic fallbacks for other response shapes.
    """

    # Attribute names reported by the debug dump when present on a response.
    _COMMON_ATTRS = ('text', 'content', 'result', 'data', 'output',
                     'extracted_text', 'ocr_text', 'choices', 'message')

    # Keys probed, in order, when the response offers dict-style access.
    _DICT_TEXT_KEYS = ('text', 'content', 'result', 'extracted_text', 'ocr_text', 'output')

    def __init__(self):
        """Initialize the PDF text extractor with Mistral AI client.

        Raises:
            ValueError: If the MISTRAL_API_KEY environment variable is unset
                or empty.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is required")
        self.client = Mistral(api_key=self.api_key)

    def encode_pdf(self, pdf_path: str) -> Optional[str]:
        """
        Encode the PDF file to base64.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Base64 encoded string or None if error
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"Error: The file {pdf_path} was not found.")
            return None
        except Exception as e:
            # Deliberately broad: callers rely on "None means failure".
            print(f"Error encoding PDF: {e}")
            return None

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
        """
        Extract text and images from uploaded PDF using Mistral AI OCR.

        Args:
            pdf_file: Gradio file object (anything with a ``name`` attribute
                holding the path) or the path itself

        Returns:
            Tuple of (extracted_text, status_message, images_data). On any
            failure the text is empty, the status describes the problem and
            the image list is empty; exceptions are not propagated.
        """
        if pdf_file is None:
            return "", "Please upload a PDF file.", []

        try:
            # Get the file path from Gradio file object
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            # Encode PDF to base64
            base64_pdf = self.encode_pdf(pdf_path)
            if base64_pdf is None:
                return "", "Failed to encode PDF file.", []

            # Process with Mistral OCR
            print("🔄 Processing PDF with Mistral OCR...")
            ocr_response = self.client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": f"data:application/pdf;base64,{base64_pdf}"
                },
                include_image_base64=True
            )

            self._log_response_structure(ocr_response)

            # Strategy 1: Mistral OCR specific - pages with markdown content and images
            extracted_text, extraction_method, extracted_images = self._extract_from_pages(ocr_response)

            # Try to extract images from other response structures if no images found yet
            if not extracted_images:
                extracted_images = self._extract_top_level_images(ocr_response)

            # Continue with fallback strategies for text extraction
            if not extracted_text:
                extracted_text, extraction_method = self._extract_fallback_text(ocr_response)

            print(f"🎯 Extraction method used: {extraction_method}")
            print(f"📏 Extracted text length: {len(extracted_text)} characters")
            print(f"🖼️ Extracted images: {len(extracted_images)}")

            if extracted_text:
                status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
                if extracted_images:
                    status += f" and {len(extracted_images)} image(s)"
            else:
                extracted_text = "No text could be extracted from this PDF."
                status = "⚠️ OCR completed but no text was found in response."
                if extracted_images:
                    status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
                print("❌ No extractable text found in OCR response")

            return extracted_text, status, extracted_images

        except Exception as e:
            # UI boundary: report the failure in the status instead of raising.
            error_msg = f"Error processing PDF: {str(e)}"
            print(error_msg)
            return "", f"❌ {error_msg}", []

    def _log_response_structure(self, ocr_response) -> None:
        """Print a best-effort description of the OCR response for debugging."""
        print("🔍 Analyzing OCR Response Structure...")
        print(f" Type: {type(ocr_response)}")
        print(f" String representation: {str(ocr_response)[:500]}...")

        # Check if it's a simple object with attributes
        if hasattr(ocr_response, '__dict__'):
            print(f" Object attributes: {list(ocr_response.__dict__.keys())}")
            for key, value in ocr_response.__dict__.items():
                print(f" {key}: {type(value)} = {str(value)[:100]}...")

        # Check if it has commonly expected attributes
        for attr in self._COMMON_ATTRS:
            if hasattr(ocr_response, attr):
                value = getattr(ocr_response, attr)
                print(f" Has '{attr}': {type(value)} = {str(value)[:100]}...")

        # Check if it's iterable but not a string
        try:
            if hasattr(ocr_response, '__iter__') and not isinstance(ocr_response, str):
                # Materialize once: counting and then re-iterating would show
                # nothing for a one-shot iterator.
                items = list(ocr_response)
                print(f" Iterable with {len(items)} items")
                for i, item in enumerate(items[:3]):  # Show first 3 items
                    print(f" Item {i}: {type(item)} = {str(item)[:100]}...")
        except Exception as e:
            print(f" Error checking iteration: {e}")

    @staticmethod
    def _build_image_record(img, page: int, image_id) -> Dict[str, Any]:
        """Return the image dict handed to the UI, defaulting missing fields."""
        return {
            'page': page,
            'image_id': image_id,
            'top_left_x': getattr(img, 'top_left_x', 0),
            'top_left_y': getattr(img, 'top_left_y', 0),
            'bottom_right_x': getattr(img, 'bottom_right_x', 0),
            'bottom_right_y': getattr(img, 'bottom_right_y', 0),
            'base64': getattr(img, 'image_base64', '')
        }

    def _extract_from_pages(self, ocr_response) -> Tuple[str, str, List[Dict[str, Any]]]:
        """Collect markdown text and images from ``ocr_response.pages``.

        Returns:
            (text, method, images); text is "" and method is "none" when no
            page carries markdown.
        """
        extracted_text = ""
        extraction_method = "none"
        extracted_images: List[Dict[str, Any]] = []

        pages = getattr(ocr_response, 'pages', None)
        if not (isinstance(pages, list) and pages):
            return extracted_text, extraction_method, extracted_images

        page_texts = []
        for i, page in enumerate(pages):
            # Extract text
            if hasattr(page, 'markdown') and page.markdown:
                page_texts.append(page.markdown)
                print(f"✅ Found text in page {i} markdown: {len(page.markdown)} characters")

            # Extract images
            if hasattr(page, 'images') and page.images:
                for j, img in enumerate(page.images):
                    image_data = self._build_image_record(img, i, f"img-{i}-{j}")
                    extracted_images.append(image_data)
                    print(f"✅ Found image in page {i}, image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")

        if page_texts:
            extracted_text = "\n\n".join(page_texts)
            extraction_method = f"pages_markdown_{len(page_texts)}_pages"

        return extracted_text, extraction_method, extracted_images

    def _extract_top_level_images(self, ocr_response) -> List[Dict[str, Any]]:
        """Collect images from a response-level ``images`` attribute, if any."""
        extracted_images: List[Dict[str, Any]] = []
        # Check if response has images attribute directly
        if hasattr(ocr_response, 'images') and ocr_response.images:
            for j, img in enumerate(ocr_response.images):
                image_data = self._build_image_record(img, 0, getattr(img, 'id', f"img-{j}"))
                extracted_images.append(image_data)
                print(f"✅ Found image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
        return extracted_images

    def _extract_fallback_text(self, ocr_response) -> Tuple[str, str]:
        """Probe non-page response shapes for text.

        Only the first matching strategy of the chain runs; if it yields
        nothing, the whole response is stringified as a last resort.

        Returns:
            (text, method); ("", "none") when nothing usable was found.
        """
        extracted_text = ""
        extraction_method = "none"

        # Strategy 2: Direct text attribute (fallback)
        if hasattr(ocr_response, 'text') and ocr_response.text:
            extracted_text = str(ocr_response.text)
            extraction_method = "direct_text_attribute"

        # Strategy 3: Content attribute (fallback)
        elif hasattr(ocr_response, 'content') and ocr_response.content:
            content = ocr_response.content
            if isinstance(content, str):
                extracted_text = content
                extraction_method = "content_attribute_string"
            elif hasattr(content, 'text'):
                extracted_text = str(content.text)
                extraction_method = "content_text_attribute"
            else:
                extracted_text = str(content)
                extraction_method = "content_attribute_converted"

        # Strategy 4: Result attribute (fallback). The truthiness check keeps
        # an empty result (e.g. None) from being reported as the text "None".
        elif hasattr(ocr_response, 'result') and ocr_response.result:
            result = ocr_response.result
            if isinstance(result, str):
                extracted_text = result
                extraction_method = "result_string"
            elif hasattr(result, 'text'):
                extracted_text = str(result.text)
                extraction_method = "result_text_attribute"
            elif isinstance(result, dict) and 'text' in result:
                extracted_text = str(result['text'])
                extraction_method = "result_dict_text"
            else:
                extracted_text = str(result)
                extraction_method = "result_converted"

        # Strategy 5: Choices attribute (ChatGPT-style response - fallback)
        elif hasattr(ocr_response, 'choices') and ocr_response.choices:
            choices = ocr_response.choices
            if isinstance(choices, list) and len(choices) > 0:
                choice = choices[0]
                if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
                    extracted_text = str(choice.message.content)
                    extraction_method = "choices_message_content"
                elif hasattr(choice, 'text'):
                    extracted_text = str(choice.text)
                    extraction_method = "choices_text"
                else:
                    extracted_text = str(choice)
                    extraction_method = "choices_converted"

        # Strategy 6: Dict-like access (fallback)
        elif hasattr(ocr_response, 'get') or isinstance(ocr_response, dict):
            for key in self._DICT_TEXT_KEYS:
                value = ocr_response.get(key)
                if value:
                    extracted_text = str(value)
                    extraction_method = f"dict_key_{key}"
                    break

        # Strategy 7: Inspect all attributes for string-like content (fallback)
        elif hasattr(ocr_response, '__dict__'):
            for key, value in ocr_response.__dict__.items():
                if isinstance(value, str) and len(value) > 20:  # Likely text content
                    extracted_text = value
                    extraction_method = f"attribute_{key}"
                    break
                elif hasattr(value, 'text') and isinstance(value.text, str):
                    extracted_text = str(value.text)
                    extraction_method = f"nested_text_in_{key}"
                    break

        # Strategy 8: Convert entire response to string if it seems to contain text (fallback)
        if not extracted_text:
            response_str = str(ocr_response)
            if len(response_str) > 50 and not response_str.startswith('<'):  # Not an object reference
                extracted_text = response_str
                extraction_method = "full_response_string"

        return extracted_text, extraction_method
|
254 |
+
|
requirements.txt
ADDED
File without changes
|
tests/test_ocr_direct.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Quick OCR Test Script
|
3 |
+
Tests the Mistral AI OCR functionality directly without the Gradio interface.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import base64
|
7 |
+
import os
|
8 |
+
import tempfile
|
9 |
+
from mistralai import Mistral
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
|
12 |
+
# Load environment variables
|
13 |
+
load_dotenv()
|
14 |
+
|
15 |
+
def create_simple_pdf_content():
    """Create a minimal one-page PDF in memory for testing.

    The page shows the text "Hello World! Test OCR" in 12pt Helvetica on a
    612x792 media box. The stream ``/Length``, the cross-reference offsets
    and the ``startxref`` value are computed from the bytes actually
    written, so the file stays internally consistent.

    Returns:
        bytes: The complete PDF document.
    """
    # Page content stream: draw one line of text near the top of the page.
    stream = (
        b"BT\n"
        b"/F1 12 Tf\n"
        b"72 720 Td\n"
        b"(Hello World! Test OCR) Tj\n"
        b"ET"
    )

    # Bodies of indirect objects 1..5, in object-number order.
    objects = [
        b"<<\n/Type /Catalog\n/Pages 2 0 R\n>>",
        b"<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>",
        (
            b"<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n"
            b"/Contents 4 0 R\n/Resources <<\n/Font <<\n/F1 5 0 R\n>>\n>>\n>>"
        ),
        b"<<\n/Length %d\n>>\nstream\n" % len(stream) + stream + b"\nendstream",
        b"<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>",
    ]

    pdf = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(objects, start=1):
        # Record where each object starts; the xref table needs byte offsets.
        offsets.append(len(pdf))
        pdf += b"%d 0 obj\n" % number + body + b"\nendobj\n\n"

    xref_offset = len(pdf)
    entry_count = len(objects) + 1  # plus the mandatory free entry 0
    pdf += b"xref\n0 %d\n" % entry_count
    # Each xref entry is exactly 20 bytes, including the " \n" terminator.
    pdf += b"0000000000 65535 f \n"
    for offset in offsets:
        pdf += b"%010d 00000 n \n" % offset
    pdf += b"trailer\n<<\n/Size %d\n/Root 1 0 R\n>>\n" % entry_count
    pdf += b"startxref\n%d\n%%%%EOF" % xref_offset

    return bytes(pdf)
|
87 |
+
|
88 |
+
def test_mistral_ocr():
    """Test the Mistral OCR functionality directly.

    Sends the in-memory test PDF to the ``mistral-ocr-latest`` model, prints
    an analysis of the response object, and tries several ways of pulling
    text out of it.

    Returns:
        bool: True when the OCR request completed (whether or not any text
        was extracted); False when the API key is missing or the request
        raised.
    """

    print("🧪 Starting Mistral OCR Test...")

    # Check API key
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("❌ MISTRAL_API_KEY environment variable not found")
        print(" Please set it in your .env file or environment")
        return False

    # Confirm presence only; never echo any part of the secret to the console.
    print("✅ API key found")

    def pages_markdown(r):
        """Join the markdown of every page, or None if the shape is absent."""
        if hasattr(r, 'pages') and r.pages and all(hasattr(p, 'markdown') for p in r.pages):
            return "\n".join([page.markdown for page in r.pages])
        return None

    try:
        # Initialize Mistral client
        client = Mistral(api_key=api_key)
        print("✅ Mistral client initialized")

        # Create a simple test PDF
        pdf_content = create_simple_pdf_content()
        base64_pdf = base64.b64encode(pdf_content).decode('utf-8')
        print(f"✅ Test PDF created ({len(pdf_content)} bytes)")

        # Test the OCR endpoint
        print("🔄 Sending OCR request to Mistral...")

        response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}"
            },
            include_image_base64=True
        )

        print("✅ OCR request completed")

        # Analyze the response
        print("\n🔍 RESPONSE ANALYSIS:")
        print(f"Response type: {type(response)}")
        print(f"Response string: {str(response)[:200]}...")

        if hasattr(response, '__dict__'):
            print(f"Response attributes: {list(response.__dict__.keys())}")
            for key, value in response.__dict__.items():
                print(f" {key}: {type(value)} = {str(value)[:100]}...")

        # Test all possible text extraction methods
        print("\n🎯 TESTING TEXT EXTRACTION METHODS:")

        methods = [
            ("response.pages[].markdown", pages_markdown),
            ("response.text", lambda r: getattr(r, 'text', None)),
            ("response.content", lambda r: getattr(r, 'content', None)),
            ("response.result", lambda r: getattr(r, 'result', None)),
            ("response.data", lambda r: getattr(r, 'data', None)),
            ("response['text']", lambda r: r.get('text') if hasattr(r, 'get') else None),
            ("response['content']", lambda r: r.get('content') if hasattr(r, 'get') else None),
        ]

        extracted_text = None
        successful_method = None

        for method_name, method_func in methods:
            # Each probe is isolated so one failing shape does not stop the rest.
            try:
                result = method_func(response)
                if result:
                    print(f"✅ {method_name}: Found content ({len(str(result))} chars)")
                    print(f" Content: {str(result)[:100]}...")
                    if not extracted_text:  # Use the first successful method
                        extracted_text = str(result)
                        successful_method = method_name
                else:
                    print(f"❌ {method_name}: No content found")
            except Exception as e:
                print(f"❌ {method_name}: Error - {e}")

        if extracted_text:
            print(f"\n🎉 SUCCESSFULLY EXTRACTED TEXT using {successful_method}:")
            print(f"📝 Full extracted text: '{extracted_text}'")
        else:
            print("\n❌ NO TEXT EXTRACTED from any method")

        return True

    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        print(f" Error type: {type(e)}")

        # If it's a 401 error, the API key might be invalid
        if "401" in str(e) or "unauthorized" in str(e).lower():
            print(" This might be an API key issue. Please check your MISTRAL_API_KEY")

        return False
|
182 |
+
|
183 |
+
def test_api_connectivity():
    """Check that a Mistral client can be constructed from the configured key.

    No request is sent to the API: the check only reads MISTRAL_API_KEY and
    instantiates the client, so it catches immediate construction errors
    rather than real network or authentication problems.

    Returns:
        bool: False when the key is missing or client construction raises,
        True otherwise.
    """
    print("🌐 Testing API connectivity...")

    key = os.environ.get("MISTRAL_API_KEY")
    if not key:
        print("❌ No API key found")
        return False

    try:
        # Constructing the client is the whole check for now; the exact call
        # to probe connectivity may vary based on Mistral's API.
        Mistral(api_key=key)
        print("🔄 Testing API connection...")
        print("✅ Mistral client appears to be working")
        return True
    except Exception as exc:
        print(f"❌ API connectivity test failed: {exc}")
        return False
|
208 |
+
|
209 |
+
def main():
    """Run the connectivity check, then the OCR check, and print guidance.

    Stops early (without printing the next steps) when the connectivity
    check fails.
    """
    separator = "=" * 40

    print("🚀 Mistral OCR Quick Test")
    print(separator)

    # Connectivity gate: no point running OCR without a usable client.
    if not test_api_connectivity():
        print("\n❌ Basic connectivity test failed")
        return

    print("\n" + separator)

    # OCR check proper.
    ocr_ok = test_mistral_ocr()
    outcome = (
        "\n✅ OCR test completed - check the response analysis above"
        if ocr_ok
        else "\n❌ OCR test failed"
    )
    print(outcome)

    next_steps = (
        "\n💡 Next steps:",
        " 1. If the test worked, run: python main.py",
        " 2. If there were errors, check the API key and try again",
        " 3. Use the response analysis to improve text extraction",
    )
    for line in next_steps:
        print(line)


if __name__ == "__main__":
    main()
|
tests/test_setup.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Test script for PDF Extractor setup validation
|
3 |
+
"""
|
4 |
+
|
5 |
+
import sys
|
6 |
+
import os
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
|
9 |
+
def test_imports():
    """Verify that gradio, mistralai and python-dotenv can be imported.

    Prints one confirmation per package and stops at the first failure.

    Returns:
        bool: True when every import succeeded, False on the first
        ImportError.
    """
    try:
        import gradio  # noqa: F401 - imported only to prove availability
        print("✅ Gradio imported successfully")

        import mistralai  # noqa: F401
        print("✅ Mistral AI imported successfully")

        from dotenv import load_dotenv  # noqa: F401
        print("✅ python-dotenv imported successfully")
    except ImportError as exc:
        print(f"❌ Import error: {exc}")
        return False

    return True
|
25 |
+
|
26 |
+
def test_environment():
    """Check that MISTRAL_API_KEY is available after loading the .env file.

    Returns:
        bool: True when the variable is set to a non-empty value, else False.
    """
    load_dotenv()

    if not os.environ.get("MISTRAL_API_KEY"):
        print("⚠️ MISTRAL_API_KEY not found in environment")
        print(" Please copy .env.example to .env and add your API key")
        return False

    # Report presence only; the key itself is never printed.
    print("✅ MISTRAL_API_KEY environment variable is set")
    return True
|
39 |
+
|
40 |
+
def main():
    """Run the import and environment checks and print a summary.

    Both checks always run; the summary then reports the package state and,
    only when the packages are fine, the environment state.
    """
    rule = "=" * 40

    print("🔍 PDF Extractor Setup Validation")
    print(rule)

    packages_ok = test_imports()
    environment_ok = test_environment()

    print("\n" + rule)

    if not packages_ok:
        summary = (
            "❌ Package installation incomplete",
            "📝 Next step: pip install -r requirements.txt",
        )
    elif environment_ok:
        summary = (
            "✅ All packages are properly installed",
            "✅ Environment is configured correctly",
            "🚀 Ready to run: python main.py",
        )
    else:
        summary = (
            "✅ All packages are properly installed",
            "⚠️ Environment needs configuration",
            "📝 Next step: Set up your .env file",
        )

    for line in summary:
        print(line)


if __name__ == "__main__":
    main()
|
ui/__init__.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""UI components for PDF Text Extractor."""
|
2 |
+
from ui.interface import create_interface
|
3 |
+
from ui.handlers import copy_text, download_text, process_images_for_display
|
4 |
+
from ui.components import (
|
5 |
+
create_header, create_upload_section, create_action_button,
|
6 |
+
create_text_display, create_action_buttons, create_image_gallery,
|
7 |
+
apply_custom_css
|
8 |
+
)
|
9 |
+
|
10 |
+
__all__ = [
|
11 |
+
"create_interface", "copy_text", "download_text", "process_images_for_display",
|
12 |
+
"create_header", "create_upload_section", "create_action_button",
|
13 |
+
"create_text_display", "create_action_buttons", "create_image_gallery",
|
14 |
+
"apply_custom_css"
|
15 |
+
]
|
ui/components.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
UI components module for PDF Text Extractor.
|
3 |
+
Contains functions for creating individual UI components.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
from typing import Tuple, List, Dict, Any
|
8 |
+
|
9 |
+
def create_header() -> gr.Markdown:
    """
    Build the title and usage instructions shown at the top of the app.

    Returns:
        gr.Markdown: Markdown component holding the header text
    """
    header_text = """
# 🔍 PDF Text Extractor

Extract text and images from PDF files using Mistral AI's OCR technology.

**Instructions:**
1. Upload a PDF file using the file selector below
2. Wait for processing to complete
3. View the extracted text and images
4. Use the Copy or Download buttons to save the extracted text

**Supported:** PDF files up to 10MB
"""
    return gr.Markdown(header_text)
|
29 |
+
|
30 |
+
def create_upload_section() -> gr.File:
    """
    Build the upload widget, restricted to a single ``.pdf`` file.

    Returns:
        gr.File: File upload component
    """
    upload_options = {
        "label": "Upload PDF File",
        "file_types": [".pdf"],
        "file_count": "single",
    }
    return gr.File(**upload_options)
|
42 |
+
|
43 |
+
def create_action_button() -> gr.Button:
    """
    Build the primary button that starts text and image extraction.

    Returns:
        gr.Button: Action button component
    """
    extract_button = gr.Button("Extract Text & Images", variant="primary")
    return extract_button
|
51 |
+
|
52 |
+
def create_text_display() -> Tuple[gr.Textbox, gr.Textbox]:
    """
    Build the two text boxes used for results: extracted text and status.

    Returns:
        Tuple[gr.Textbox, gr.Textbox]: (text output, status output), in that
        order
    """
    # Main result box: 10 visible lines, growing to 20, with a copy button.
    extracted_box = gr.Textbox(
        label="Extracted Text",
        lines=10,
        max_lines=20,
        placeholder="Extracted text will appear here...",
        show_copy_button=True,
    )

    # Short status line shown alongside the result.
    status_box = gr.Textbox(
        label="Status",
        lines=2,
        placeholder="Upload a PDF to see status...",
    )

    return extracted_box, status_box
|
74 |
+
|
75 |
+
def create_image_gallery() -> gr.Gallery:
    """
    Build the gallery that displays images extracted from the PDF.

    The gallery is laid out as 3 columns by 2 rows and carries the element
    id ``image_gallery``.

    Returns:
        gr.Gallery: Image gallery component
    """
    gallery_options = {
        "label": "Extracted Images",
        "columns": 3,
        "rows": 2,
        "object_fit": "contain",
        "height": "auto",
        "visible": True,
        "show_label": True,
        "elem_id": "image_gallery",
    }
    return gr.Gallery(**gallery_options)
|
92 |
+
|
93 |
+
def create_action_buttons() -> Tuple[gr.Button, gr.Button]:
    """
    Build the buttons for copying and downloading the extracted text.

    Returns:
        Tuple[gr.Button, gr.Button]: (copy button, download button), in that
        order
    """
    copy_button = gr.Button("📋 Copy to Clipboard")
    download_button = gr.Button("📥 Download as Text File")
    return copy_button, download_button
|
104 |
+
|
105 |
+
def apply_custom_css() -> gr.HTML:
    """
    Emit the application's stylesheet as an HTML component.

    The rules cap the container width, set a monospace font on
    ``.output-markdown`` and centre/shrink ``.image-gallery-caption``.

    Returns:
        gr.HTML: HTML component wrapping a single <style> element
    """
    css_component = gr.HTML("""
    <style>
    .gradio-container {
        max-width: 900px !important;
    }
    .output-markdown {
        font-family: 'Courier New', monospace;
    }
    .image-gallery-caption {
        text-align: center;
        font-size: 0.9em;
    }
    </style>
    """)
    return css_component
|
ui/handlers.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Event handlers for UI components.
|
3 |
+
Contains functions that handle user interactions with the interface.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import tempfile
|
8 |
+
from typing import Optional, List, Dict, Any
|
9 |
+
from utils.pdf_image_extractor import PDFImageExtractor
|
10 |
+
|
11 |
+
def copy_text(text: str) -> str:
    """
    Pass the text through for the Copy button.

    This function performs no clipboard work itself; it only hands back
    what it was given.

    Args:
        text: Text to copy to clipboard

    Returns:
        str: The same text that was passed in
    """
    unchanged = text
    return unchanged
|
22 |
+
|
23 |
+
def download_text(text: str) -> Optional[str]:
    """
    Handle Download button click.

    The text is written as UTF-8 to a file named ``extracted_text.txt``
    inside a freshly created temporary directory. A new directory is used
    for every call so that simultaneous downloads cannot overwrite each
    other's file, while the file keeps a stable, friendly name.

    Args:
        text: Text to download

    Returns:
        Optional[str]: Path to the created text file or None if text is empty
    """
    if not text:
        return None

    # mkdtemp creates a uniquely named directory, so the path is unique per call.
    download_dir = tempfile.mkdtemp(prefix="pdf_text_")
    file_path = os.path.join(download_dir, "extracted_text.txt")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    return file_path
|
49 |
+
|
50 |
+
def process_images_for_display(images_data: List[Dict[str, Any]], pdf_path: Optional[str] = None) -> List:
    """
    Process images for display in the Gradio gallery.

    Strategies are tried in order and the first one that yields any image
    wins:

    1. Crop the images out of the PDF using the bounding boxes in
       ``images_data`` (only when ``pdf_path`` exists on disk).
    2. Extract every embedded image from the PDF, truncated to
       ``len(images_data)`` entries.
    3. Decode the ``'base64'`` entry of each item in ``images_data``.

    Args:
        images_data: List of image data dictionaries from OCR response
        pdf_path: Path to the original PDF file for image extraction

    Returns:
        List: List of image paths for gallery display
    """
    import base64

    if not images_data:
        return []

    # If we have PDF path and bounding box data, extract images from PDF
    if pdf_path and os.path.exists(pdf_path):
        print("🖼️ Extracting images from PDF using bounding box coordinates...")
        extracted_paths = PDFImageExtractor.extract_images_from_pdf(pdf_path, images_data)
        if extracted_paths:
            return extracted_paths

        # Fallback: extract all images from PDF if bounding box extraction failed
        print("🔄 Fallback: Extracting all images from PDF...")
        extracted_paths = PDFImageExtractor.extract_all_images_from_pdf(pdf_path)
        if extracted_paths:
            return extracted_paths[:len(images_data)]  # Limit to expected number of images

    # Fallback: use base64 data from OCR response
    print("🔄 Using base64 image data from OCR response...")
    gallery_images = []
    output_dir = None  # created lazily so no empty directory is left behind

    for index, img_data in enumerate(images_data):
        try:
            base64_data = img_data.get('base64', '')
            if not base64_data:
                continue

            # Tolerate a "data:<mime>;base64," header: decoding it together
            # with the payload would corrupt the image bytes.
            if base64_data.startswith("data:") and "," in base64_data:
                base64_data = base64_data.split(",", 1)[1]

            # Decode before touching the filesystem so a bad payload does not
            # leave an empty file behind.
            image_bytes = base64.b64decode(base64_data)

            # A per-call directory keeps concurrent requests from overwriting
            # each other's files.
            if output_dir is None:
                output_dir = tempfile.mkdtemp(prefix="pdf_images_")
            img_path = os.path.join(output_dir, f"extracted_image_fallback_{index}.jpg")

            with open(img_path, "wb") as img_file:
                img_file.write(image_bytes)

            # Add path to gallery list (Gradio Gallery expects a list of paths)
            gallery_images.append(img_path)

        except Exception as e:
            # Best effort: one bad image must not prevent the others from showing.
            print(f"Error processing image {index}: {str(e)}")

    return gallery_images
|
ui/interface.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Interface creation module for PDF Text Extractor.
|
3 |
+
Defines the Gradio interface components and layout.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
from pdf_text_extractor import PDFTextExtractor
|
8 |
+
from ui.handlers import copy_text, download_text, process_images_for_display
|
9 |
+
from ui.components import (
|
10 |
+
create_header, create_upload_section, create_action_button,
|
11 |
+
create_text_display, create_action_buttons, create_image_gallery, apply_custom_css
|
12 |
+
)
|
13 |
+
|
14 |
+
def create_dummy_interface() -> gr.Blocks:
    """
    Build the placeholder UI shown when no API key is available.

    The layout has an upload box, text areas and buttons, but every button
    and the text output are non-interactive and no event handlers are
    attached.

    Returns:
        gr.Blocks: Gradio interface with disabled functionality
    """
    # Shared by every control that must not respond to the user.
    disabled = {"interactive": False}

    with gr.Blocks(title="PDF Text Extractor") as placeholder_ui:
        gr.Markdown("""
        # 🔍 PDF Text Extractor

        ⚠️ **API key not configured.** Please set MISTRAL_API_KEY environment variable and restart the application.
        """)

        with gr.Row():
            gr.File(label="Upload PDF", file_types=[".pdf"])

        with gr.Row():
            gr.Button("Extract Text", variant="primary", **disabled)

        with gr.Row():
            gr.Textbox(
                label="Extracted Text",
                lines=10,
                value="API key not configured. Text extraction is unavailable.",
                **disabled
            )

        with gr.Row():
            gr.Textbox(
                label="Status",
                lines=2,
                value="❌ MISTRAL_API_KEY environment variable is not set. Please set it and restart the application."
            )

        with gr.Row():
            gr.Button("📋 Copy to Clipboard", **disabled)
            gr.Button("📥 Download as Text File", **disabled)

    return placeholder_ui
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
def create_main_interface(extractor: PDFTextExtractor) -> gr.Blocks:
    """
    Build the fully functional application UI.

    Layout, top to bottom: header, PDF upload, extract button, status box,
    a tab with the extracted text plus copy/download buttons, a tab with
    the image gallery, a download file slot and the custom stylesheet.

    Args:
        extractor: PDFTextExtractor instance

    Returns:
        gr.Blocks: Gradio interface with full functionality
    """
    def process_pdf_wrapper(pdf_file):
        """Run extraction with the extractor captured from the enclosing scope."""
        extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)

        # The upload may be an object exposing .name, a plain path, or empty.
        if hasattr(pdf_file, 'name'):
            pdf_path = pdf_file.name
        elif pdf_file:
            pdf_path = pdf_file
        else:
            pdf_path = None

        gallery_images = process_images_for_display(images_data, pdf_path)
        return extracted_text, status, gallery_images

    with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as demo:
        create_header()

        # Upload row
        with gr.Row():
            pdf_input = create_upload_section()

        # Extract button row
        with gr.Row():
            submit_btn = create_action_button()

        # Status row
        with gr.Row():
            status_output = gr.Textbox(
                label="Status",
                lines=2,
                placeholder="Upload a PDF to see status..."
            )

        # One tab for the text, one for the images
        with gr.Tabs():
            with gr.TabItem("Extracted Text"):
                text_output = gr.Textbox(
                    label="Extracted Text",
                    lines=15,
                    max_lines=30,
                    placeholder="Extracted text will appear here...",
                    show_copy_button=True
                )

                # Buttons that act on the extracted text
                with gr.Row():
                    copy_btn, download_btn = create_action_buttons()

            with gr.TabItem("Extracted Images"):
                image_gallery = create_image_gallery()
                gr.Markdown("Images extracted from the PDF will appear here.")

        # Extraction fills the text box, the status box and the gallery.
        submit_btn.click(
            fn=process_pdf_wrapper,
            inputs=[pdf_input],
            outputs=[text_output, status_output, image_gallery]
        )

        # Copy: the attached JavaScript writes to the clipboard and shows a
        # notification that is removed after two seconds.
        copy_btn.click(
            fn=copy_text,
            inputs=text_output,
            outputs=None,
            js="""
            function(text) {
                if (text) {
                    navigator.clipboard.writeText(text);
                    // Show a temporary notification
                    var notification = document.createElement('div');
                    notification.textContent = 'Text copied to clipboard!';
                    notification.style.position = 'fixed';
                    notification.style.bottom = '20px';
                    notification.style.left = '50%';
                    notification.style.transform = 'translateX(-50%)';
                    notification.style.padding = '10px 20px';
                    notification.style.background = '#4CAF50';
                    notification.style.color = 'white';
                    notification.style.borderRadius = '4px';
                    notification.style.zIndex = '1000';
                    document.body.appendChild(notification);
                    setTimeout(function() {
                        document.body.removeChild(notification);
                    }, 2000);
                }
                return text;
            }
            """
        )

        # Download: the file component that receives the path is created here.
        download_target = gr.File(label="Download", elem_id="download_output")
        download_btn.click(
            fn=download_text,
            inputs=text_output,
            outputs=download_target,
            show_progress=False
        )

        apply_custom_css()

    return demo
|
163 |
+
|
164 |
+
def create_interface() -> gr.Blocks:
    """
    Build the interface appropriate for the current configuration.

    A ValueError raised while constructing the extractor or the main
    interface selects the placeholder interface instead.

    Returns:
        gr.Blocks: Configured Gradio interface
    """
    try:
        return create_main_interface(PDFTextExtractor())
    except ValueError:
        # Fall back to the disabled placeholder UI.
        return create_dummy_interface()
|
utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions for PDF Text Extractor."""
|
2 |
+
from utils.config import check_api_key, get_app_config
|
3 |
+
|
4 |
+
__all__ = ["check_api_key", "get_app_config"]
|
utils/config.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration utilities for PDF Text Extractor.
|
3 |
+
Contains functions for handling environment variables and app configuration.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
from typing import Dict, Any
|
8 |
+
|
9 |
+
def check_api_key() -> bool:
    """
    Check if the Mistral API key is set in environment variables.

    A value that is missing, empty or made up only of whitespace counts as
    not set; in that case a warning with setup instructions is printed.

    Returns:
        bool: True if API key is set, False otherwise
    """
    api_key = os.environ.get("MISTRAL_API_KEY", "")

    # strip() so that a blank value (e.g. "MISTRAL_API_KEY= ") is not
    # mistaken for a configured key.
    if api_key.strip():
        return True

    print("⚠️  Warning: MISTRAL_API_KEY environment variable is not set.")
    print("   Please set it before using the PDF extraction functionality.")
    print("   Example: export MISTRAL_API_KEY='your-api-key-here'")
    print()
    return False
|
24 |
+
|
25 |
+
def get_app_config() -> Dict[str, Any]:
    """
    Return the launch settings for the application.

    A new dictionary is built on every call.

    Returns:
        Dict[str, Any]: Application configuration settings
    """
    settings: Dict[str, Any] = {}
    settings["server_port"] = 7861       # Use different port to avoid conflicts
    settings["debug"] = True             # Enable debug mode for development
    settings["quiet"] = False            # Show startup messages
    settings["max_file_size"] = "10mb"   # Limit PDF file size
    # To enable external access and public link sharing, also set:
    #   settings["server_name"] = "0.0.0.0"  # Allow external access
    #   settings["share"] = True             # Create public link
    return settings
|
utils/pdf_image_extractor.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
PDF Image Extraction utilities.
|
3 |
+
Extracts images from PDF using bounding box coordinates.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import tempfile
|
8 |
+
from typing import List, Dict, Any, Optional
|
9 |
+
import fitz # PyMuPDF
|
10 |
+
from PIL import Image
|
11 |
+
import base64
|
12 |
+
import io
|
13 |
+
|
14 |
+
|
15 |
+
class PDFImageExtractor:
    """Extract images from PDF using bounding box coordinates."""

    @staticmethod
    def _write_base64_image(base64_data: str, img_path: str) -> None:
        """
        Decode a base64 image payload and write it to ``img_path``.

        An optional ``data:<mime>;base64,`` header is removed first. The
        payload is decoded before the file is opened so that a malformed
        payload does not leave an empty file behind.

        Args:
            base64_data: Base64 text, with or without a data-URI header
            img_path: Destination file path

        Raises:
            ValueError: If the payload is not valid base64
            OSError: If the file cannot be written
        """
        if base64_data.startswith("data:") and "," in base64_data:
            base64_data = base64_data.split(",", 1)[1]
        image_bytes = base64.b64decode(base64_data)
        with open(img_path, "wb") as img_file:
            img_file.write(image_bytes)

    @staticmethod
    def extract_images_from_pdf(pdf_path: str, images_data: List[Dict[str, Any]]) -> List[str]:
        """
        Extract images from PDF using bounding box coordinates.

        Each entry of ``images_data`` is read for the keys ``'page'``,
        ``'top_left_x'``, ``'top_left_y'``, ``'bottom_right_x'`` and
        ``'bottom_right_y'`` (all defaulting to 0). The region is rendered
        at 2x zoom and saved as PNG. If rendering an entry fails and the
        entry carries a ``'base64'`` value, that value is decoded instead.
        Files are written into a temporary directory created for this call.

        Args:
            pdf_path: Path to the PDF file
            images_data: List of image data with bounding box coordinates

        Returns:
            List[str]: List of paths to extracted image files; empty if
            ``images_data`` is empty or the PDF cannot be opened
        """
        if not images_data:
            return []

        try:
            # The context manager closes the document on every exit path.
            with fitz.open(pdf_path) as pdf_doc:
                extracted_image_paths = []
                # Per-call directory: concurrent requests never share filenames.
                temp_dir = tempfile.mkdtemp(prefix="pdf_images_")
                page_count = len(pdf_doc)

                for index, img_data in enumerate(images_data):
                    try:
                        page_num = img_data.get('page', 0)

                        # Reject negative numbers too: a negative index would
                        # otherwise select a page counted from the end.
                        if not 0 <= page_num < page_count:
                            print(f"Warning: Page {page_num} not found in PDF (max: {page_count-1})")
                            continue

                        page = pdf_doc[page_num]

                        # PyMuPDF uses (x0, y0, x1, y1) format
                        bbox = fitz.Rect(
                            img_data.get('top_left_x', 0),
                            img_data.get('top_left_y', 0),
                            img_data.get('bottom_right_x', 0),
                            img_data.get('bottom_right_y', 0),
                        )

                        # Render only the clipped region, at 2x zoom for better quality
                        mat = fitz.Matrix(2.0, 2.0)
                        pix = page.get_pixmap(matrix=mat, clip=bbox)

                        img_filename = f"extracted_image_page{page_num}_{index}.png"
                        img_path = os.path.join(temp_dir, img_filename)

                        # The pixmap already encodes to PNG; write those bytes directly.
                        with open(img_path, "wb") as img_file:
                            img_file.write(pix.tobytes("png"))

                        extracted_image_paths.append(img_path)
                        print(f"✅ Extracted image {index} from page {page_num}: {img_path}")

                    except Exception as e:
                        # Best effort: report and try the base64 fallback.
                        print(f"Error extracting image {index}: {str(e)}")

                        base64_data = img_data.get('base64', '')
                        if base64_data:
                            try:
                                img_filename = f"extracted_image_base64_{index}.jpg"
                                img_path = os.path.join(temp_dir, img_filename)
                                PDFImageExtractor._write_base64_image(base64_data, img_path)

                                extracted_image_paths.append(img_path)
                                print(f"✅ Used base64 data for image {index}: {img_path}")
                            except Exception as e2:
                                print(f"Error using base64 data for image {index}: {str(e2)}")

                return extracted_image_paths

        except Exception as e:
            print(f"Error opening PDF file: {str(e)}")
            return []

    @staticmethod
    def extract_all_images_from_pdf(pdf_path: str) -> List[str]:
        """
        Extract all images from PDF without using bounding boxes.
        This is a fallback method when no bounding box data is available.

        Every image listed by each page is written as PNG into a temporary
        directory created for this call. Images with four or more colour
        components (e.g. CMYK) are converted to RGB first.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List[str]: List of paths to extracted image files; empty if the
            PDF cannot be opened or read
        """
        try:
            # The context manager closes the document on every exit path.
            with fitz.open(pdf_path) as pdf_doc:
                extracted_image_paths = []
                # Per-call directory: concurrent requests never share filenames.
                temp_dir = tempfile.mkdtemp(prefix="pdf_images_")

                for page_num, page in enumerate(pdf_doc):
                    for img_index, img in enumerate(page.get_images()):
                        try:
                            xref = img[0]  # first item of the entry is the image xref
                            pix = fitz.Pixmap(pdf_doc, xref)

                            if pix.n - pix.alpha < 4:  # GRAY or RGB
                                img_data = pix.tobytes("png")
                            else:  # CMYK: convert to RGB first
                                rgb_pix = fitz.Pixmap(fitz.csRGB, pix)
                                img_data = rgb_pix.tobytes("png")
                                rgb_pix = None  # drop the reference promptly

                            img_filename = f"all_images_page{page_num}_img{img_index}.png"
                            img_path = os.path.join(temp_dir, img_filename)

                            with open(img_path, "wb") as f:
                                f.write(img_data)

                            extracted_image_paths.append(img_path)
                            pix = None  # drop the reference promptly

                        except Exception as e:
                            # Best effort: one bad image must not stop the rest.
                            print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")

                return extracted_image_paths

        except Exception as e:
            print(f"Error extracting all images from PDF: {str(e)}")
            return []
|