spagestic committed on
Commit
c725c9c
·
1 Parent(s): 580b86f

feat: Refactor PDF Text Extractor application structure

Browse files

- Introduced a modular architecture by separating the application into distinct modules: app.py, ui, and utils.
- Implemented a main function in app.py to handle application launch and configuration.
- Added environment variable loading and API key validation.
- Created a .env.example file for environment variable setup guidance.
- Enhanced the UI components and handlers for better user interaction.
- Developed a comprehensive PDF text extraction utility using Mistral AI.
- Added tests for OCR functionality and setup validation.
- Updated .gitignore to exclude environment files and unnecessary artifacts.

.env.example ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Environment variables for PDF Explainer
2
+ # Copy this file to .env and fill in your actual API key
3
+
4
+ # Mistral AI API Key - Get yours from https://console.mistral.ai/
5
+ MISTRAL_API_KEY=your_mistral_api_key_here
.gitignore ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ **/.env
3
+
4
+ # Python cache
5
+ **/__pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # Virtual environment
10
+ **/.venv
11
+ .venv/
12
+ venv/
13
+ env/
14
+
15
+ # IDE files
16
+ .vscode/settings.json
17
+ .idea/
18
+
19
+ # OS files
20
+ .DS_Store
21
+ Thumbs.db
22
+
23
+ # Gradio temporary files
24
+ gradio_cached_examples/
25
+ flagged/
26
+
27
+ # Log files
28
+ *.log
29
+
30
+ # Distribution / packaging
31
+ .Python
32
+ build/
33
+ develop-eggs/
34
+ dist/
35
+ downloads/
36
+ eggs/
37
+ .eggs/
38
+ lib/
39
+ lib64/
40
+ parts/
41
+ sdist/
42
+ var/
43
+ wheels/
44
+ *.egg-info/
45
+ .installed.cfg
46
+ *.egg
47
+ MANIFEST
app.py CHANGED
@@ -1,7 +1,35 @@
1
- import gradio as gr
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Text Extractor Application
3
+ Main entry point for the PDF Text Extractor application.
4
+ """
5
 
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from ui import create_interface
9
+ from utils.config import check_api_key, get_app_config
10
 
11
def main():
    """Start the PDF Text Extractor web application.

    Steps, in order: load variables from a ``.env`` file, run the API-key
    check, build the interface, read the application configuration and
    launch the interface with the configured settings.
    """
    # Make values from .env visible through os.environ before anything reads them.
    load_dotenv()
    check_api_key()

    interface = create_interface()
    app_config = get_app_config()

    # Forward exactly these configuration entries to launch(); a missing
    # entry raises KeyError just as direct indexing would.
    launch_keys = ("server_port", "debug", "quiet", "max_file_size")
    interface.launch(**{key: app_config[key] for key in launch_keys})
33
+
34
+ if __name__ == "__main__":
35
+ main()
main.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Text Extractor using Gradio and Mistral AI
3
+ A web application for extracting text from PDF files using Mistral's OCR capabilities.
4
+
5
+ This is a legacy entry point that maintains compatibility with the original app.
6
+ For a more modular structure, see app.py and the ui/ and utils/ folders.
7
+ """
8
+
9
+ # Import from the new modular structure
10
+ from app import main
11
+
12
+
13
+ # Execute the main function when run as script
14
+ if __name__ == "__main__":
15
+ main()
pdf_text_extractor.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from typing import Optional, Tuple, List, Dict, Any
4
+ from mistralai import Mistral
5
+
6
class PDFTextExtractor:
    """Extract text and embedded images from PDF files via Mistral AI OCR."""

    def __init__(self):
        """Create the Mistral client from the ``MISTRAL_API_KEY`` variable.

        Raises:
            ValueError: If ``MISTRAL_API_KEY`` is unset or empty.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is required")
        self.client = Mistral(api_key=self.api_key)

    def encode_pdf(self, pdf_path: str) -> Optional[str]:
        """
        Encode the PDF file to base64.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Base64 encoded string, or None if the file is missing or cannot
            be read (the error is printed, not raised).
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"Error: The file {pdf_path} was not found.")
            return None
        except Exception as e:
            print(f"Error encoding PDF: {e}")
            return None

    @staticmethod
    def _image_record(img, page_index: int, image_id) -> Dict[str, Any]:
        """Build the image dictionary returned to callers from one OCR image object."""
        return {
            'page': page_index,
            'image_id': image_id,
            'top_left_x': getattr(img, 'top_left_x', 0),
            'top_left_y': getattr(img, 'top_left_y', 0),
            'bottom_right_x': getattr(img, 'bottom_right_x', 0),
            'bottom_right_y': getattr(img, 'bottom_right_y', 0),
            'base64': getattr(img, 'image_base64', ''),
        }

    @staticmethod
    def _log_response_structure(ocr_response) -> None:
        """Print a truncated description of the OCR response for debugging."""
        print("🔍 Analyzing OCR Response Structure...")
        print(f" Type: {type(ocr_response)}")
        print(f" String representation: {str(ocr_response)[:500]}...")

        # Check if it's a simple object with attributes
        if hasattr(ocr_response, '__dict__'):
            print(f" Object attributes: {list(ocr_response.__dict__.keys())}")
            for key, value in ocr_response.__dict__.items():
                print(f" {key}: {type(value)} = {str(value)[:100]}...")

        # Check if it has commonly expected attributes
        common_attrs = ['text', 'content', 'result', 'data', 'output', 'extracted_text', 'ocr_text', 'choices', 'message']
        for attr in common_attrs:
            if hasattr(ocr_response, attr):
                value = getattr(ocr_response, attr)
                print(f" Has '{attr}': {type(value)} = {str(value)[:100]}...")

        # Check if it's iterable but not a string
        try:
            if hasattr(ocr_response, '__iter__') and not isinstance(ocr_response, str):
                # Materialize once so the count and the preview come from the
                # same pass (a second pass over a one-shot iterator is empty).
                items = list(ocr_response)
                print(f" Iterable with {len(items)} items")
                for i, item in enumerate(items[:3]):  # Show first 3 items
                    print(f" Item {i}: {type(item)} = {str(item)[:100]}...")
        except Exception as e:
            print(f" Error checking iteration: {e}")

    @classmethod
    def _collect_pages(cls, ocr_response) -> Tuple[List[str], List[Dict[str, Any]]]:
        """Return (markdown texts, image records) found in ``ocr_response.pages``."""
        page_texts: List[str] = []
        images: List[Dict[str, Any]] = []

        if not (hasattr(ocr_response, 'pages') and ocr_response.pages):
            return page_texts, images
        pages = ocr_response.pages
        if not (isinstance(pages, list) and len(pages) > 0):
            return page_texts, images

        for i, page in enumerate(pages):
            # Extract text
            if hasattr(page, 'markdown') and page.markdown:
                page_texts.append(page.markdown)
                print(f"✅ Found text in page {i} markdown: {len(page.markdown)} characters")

            # Extract images
            if hasattr(page, 'images') and page.images:
                for j, img in enumerate(page.images):
                    image_data = cls._image_record(img, i, f"img-{i}-{j}")
                    images.append(image_data)
                    print(f"✅ Found image in page {i}, image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")

        return page_texts, images

    @staticmethod
    def _fallback_text(ocr_response) -> Tuple[str, str]:
        """Try alternative response shapes; return (text, extraction_method).

        Only the first strategy whose condition matches is attempted, exactly
        like an ``if``/``elif`` chain. Returns ``("", "none")`` when nothing
        yields text.
        """
        extracted_text = ""
        extraction_method = "none"

        # Strategy 2: Direct text attribute (fallback)
        if hasattr(ocr_response, 'text') and ocr_response.text:
            extracted_text = str(ocr_response.text)
            extraction_method = "direct_text_attribute"

        # Strategy 3: Content attribute (fallback)
        elif hasattr(ocr_response, 'content') and ocr_response.content:
            content = ocr_response.content
            if isinstance(content, str):
                extracted_text = content
                extraction_method = "content_attribute_string"
            elif hasattr(content, 'text'):
                extracted_text = str(content.text)
                extraction_method = "content_text_attribute"
            else:
                extracted_text = str(content)
                extraction_method = "content_attribute_converted"

        # Strategy 4: Result attribute (fallback).
        # The truthiness check matters: without it a present-but-None
        # ``result`` was stringified and returned as the text "None".
        elif hasattr(ocr_response, 'result') and ocr_response.result:
            result = ocr_response.result
            if isinstance(result, str):
                extracted_text = result
                extraction_method = "result_string"
            elif hasattr(result, 'text'):
                extracted_text = str(result.text)
                extraction_method = "result_text_attribute"
            elif isinstance(result, dict) and 'text' in result:
                extracted_text = str(result['text'])
                extraction_method = "result_dict_text"
            else:
                extracted_text = str(result)
                extraction_method = "result_converted"

        # Strategy 5: Choices attribute (ChatGPT-style response - fallback)
        elif hasattr(ocr_response, 'choices') and ocr_response.choices:
            choices = ocr_response.choices
            if isinstance(choices, list) and len(choices) > 0:
                choice = choices[0]
                if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
                    extracted_text = str(choice.message.content)
                    extraction_method = "choices_message_content"
                elif hasattr(choice, 'text'):
                    extracted_text = str(choice.text)
                    extraction_method = "choices_text"
                else:
                    extracted_text = str(choice)
                    extraction_method = "choices_converted"

        # Strategy 6: Dict-like access (fallback)
        elif hasattr(ocr_response, 'get') or isinstance(ocr_response, dict):
            for key in ['text', 'content', 'result', 'extracted_text', 'ocr_text', 'output']:
                value = ocr_response.get(key)
                if value:
                    extracted_text = str(value)
                    extraction_method = f"dict_key_{key}"
                    break

        # Strategy 7: Inspect all attributes for string-like content (fallback)
        elif hasattr(ocr_response, '__dict__'):
            for key, value in ocr_response.__dict__.items():
                if isinstance(value, str) and len(value) > 20:  # Likely text content
                    extracted_text = value
                    extraction_method = f"attribute_{key}"
                    break
                elif hasattr(value, 'text') and isinstance(value.text, str):
                    extracted_text = str(value.text)
                    extraction_method = f"nested_text_in_{key}"
                    break

        return extracted_text, extraction_method

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
        """
        Extract text and images from uploaded PDF using Mistral AI OCR.

        Args:
            pdf_file: Object with a ``name`` attribute holding the file path
                (e.g. a Gradio file object), or the path itself.

        Returns:
            Tuple of (extracted_text, status_message, images_data). Each
            image entry is a dict with the keys ``page``, ``image_id``,
            ``top_left_x``, ``top_left_y``, ``bottom_right_x``,
            ``bottom_right_y`` and ``base64``. Errors are reported through
            the status message; this method does not raise.
        """
        if pdf_file is None:
            return "", "Please upload a PDF file.", []

        try:
            # Get the file path from Gradio file object
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            # Encode PDF to base64
            base64_pdf = self.encode_pdf(pdf_path)
            if base64_pdf is None:
                return "", "Failed to encode PDF file.", []

            # Process with Mistral OCR
            print("🔄 Processing PDF with Mistral OCR...")
            ocr_response = self.client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": f"data:application/pdf;base64,{base64_pdf}"
                },
                include_image_base64=True
            )

            # Enhanced debugging and response parsing
            self._log_response_structure(ocr_response)

            extracted_text = ""
            extraction_method = "none"

            # Strategy 1: Mistral OCR specific - pages with markdown content and images
            page_texts, extracted_images = self._collect_pages(ocr_response)
            if page_texts:
                extracted_text = "\n\n".join(page_texts)
                extraction_method = f"pages_markdown_{len(page_texts)}_pages"

            # Try to extract images from other response structures if no images found yet
            if not extracted_images:
                if hasattr(ocr_response, 'images') and ocr_response.images:
                    for j, img in enumerate(ocr_response.images):
                        image_data = self._image_record(img, 0, getattr(img, 'id', f"img-{j}"))
                        extracted_images.append(image_data)
                        print(f"✅ Found image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")

            # Continue with fallback strategies for text extraction
            if not extracted_text:
                extracted_text, extraction_method = self._fallback_text(ocr_response)

            # Strategy 8: Convert entire response to string if it seems to contain text (fallback)
            if not extracted_text:
                response_str = str(ocr_response)
                if len(response_str) > 50 and not response_str.startswith('<'):  # Not an object reference
                    extracted_text = response_str
                    extraction_method = "full_response_string"

            print(f"🎯 Extraction method used: {extraction_method}")
            print(f"📏 Extracted text length: {len(extracted_text)} characters")
            print(f"🖼️ Extracted images: {len(extracted_images)}")

            if extracted_text:
                status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
                if extracted_images:
                    status += f" and {len(extracted_images)} image(s)"
            else:
                extracted_text = "No text could be extracted from this PDF."
                status = "⚠️ OCR completed but no text was found in response."
                if extracted_images:
                    status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
                print("❌ No extractable text found in OCR response")

            return extracted_text, status, extracted_images

        except Exception as e:
            # Top-level boundary for the UI: report the failure as a status
            # message instead of propagating it.
            error_msg = f"Error processing PDF: {str(e)}"
            print(error_msg)
            return "", f"❌ {error_msg}", []
254
+
requirements.txt ADDED
File without changes
tests/test_ocr_direct.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick OCR Test Script
3
+ Tests the Mistral AI OCR functionality directly without the Gradio interface.
4
+ """
5
+
6
+ import base64
7
+ import os
8
+ import tempfile
9
+ from mistralai import Mistral
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
def create_simple_pdf_content():
    """Create a minimal single-page PDF in memory for testing.

    The page shows the text "Hello World! Test OCR" in 12pt Helvetica.
    The stream ``/Length`` and every cross-reference offset are computed
    from the bytes actually emitted, so the table always matches the body.

    Returns:
        bytes: The complete PDF document.
    """
    # Page content stream: draw one line of text near the top of the page.
    stream = b"BT\n/F1 12 Tf\n72 720 Td\n(Hello World! Test OCR) Tj\nET"

    # Bodies of objects 1..5, in object-number order.
    objects = [
        b"<<\n/Type /Catalog\n/Pages 2 0 R\n>>",
        b"<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>",
        (
            b"<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n"
            b"/Contents 4 0 R\n/Resources <<\n/Font <<\n/F1 5 0 R\n>>\n>>\n>>"
        ),
        b"<<\n/Length %d\n>>\nstream\n" % len(stream) + stream + b"\nendstream",
        b"<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>",
    ]

    pdf = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(objects, start=1):
        offsets.append(len(pdf))  # byte offset of "N 0 obj"
        pdf += b"%d 0 obj\n" % number + body + b"\nendobj\n\n"

    xref_start = len(pdf)
    size = len(objects) + 1  # object 0 is the head of the free list
    pdf += b"xref\n0 %d\n" % size
    # Each xref entry must be exactly 20 bytes, hence the " \n" terminator.
    pdf += b"0000000000 65535 f \n"
    for offset in offsets:
        pdf += b"%010d 00000 n \n" % offset
    pdf += b"trailer\n<<\n/Size %d\n/Root 1 0 R\n>>\n" % size
    pdf += b"startxref\n%d\n" % xref_start
    pdf += b"%%EOF"

    return bytes(pdf)
87
+
88
def test_mistral_ocr():
    """Send a tiny generated PDF to Mistral OCR and report what comes back.

    Prints the response structure and tries several ways of reading text
    from it, so the output shows which access path works.

    Returns:
        bool: True if the OCR request completed (whether or not any text
        was extracted), False if the API key is missing or the request
        raised.
    """
    print("🧪 Starting Mistral OCR Test...")

    # Check API key
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("❌ MISTRAL_API_KEY environment variable not found")
        print(" Please set it in your .env file or environment")
        return False

    # Confirm presence only; never echo any part of the secret to the console.
    print("✅ API key found")

    def _pages_markdown(r):
        # Joined markdown of all pages, or None unless every page has a
        # ``markdown`` attribute.
        if hasattr(r, 'pages') and r.pages and all(hasattr(p, 'markdown') for p in r.pages):
            return "\n".join([page.markdown for page in r.pages])
        return None

    try:
        # Initialize Mistral client
        client = Mistral(api_key=api_key)
        print("✅ Mistral client initialized")

        # Create a simple test PDF
        pdf_content = create_simple_pdf_content()
        base64_pdf = base64.b64encode(pdf_content).decode('utf-8')
        print(f"✅ Test PDF created ({len(pdf_content)} bytes)")

        # Test the OCR endpoint
        print("🔄 Sending OCR request to Mistral...")

        response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}"
            },
            include_image_base64=True
        )

        print("✅ OCR request completed")

        # Analyze the response
        print("\n🔍 RESPONSE ANALYSIS:")
        print(f"Response type: {type(response)}")
        print(f"Response string: {str(response)[:200]}...")

        if hasattr(response, '__dict__'):
            print(f"Response attributes: {list(response.__dict__.keys())}")
            for key, value in response.__dict__.items():
                print(f" {key}: {type(value)} = {str(value)[:100]}...")

        # Test all possible text extraction methods
        print("\n🎯 TESTING TEXT EXTRACTION METHODS:")

        methods = [
            ("response.pages[].markdown", _pages_markdown),
            ("response.text", lambda r: getattr(r, 'text', None)),
            ("response.content", lambda r: getattr(r, 'content', None)),
            ("response.result", lambda r: getattr(r, 'result', None)),
            ("response.data", lambda r: getattr(r, 'data', None)),
            ("response['text']", lambda r: r.get('text') if hasattr(r, 'get') else None),
            ("response['content']", lambda r: r.get('content') if hasattr(r, 'get') else None),
        ]

        extracted_text = None
        successful_method = None

        for method_name, method_func in methods:
            try:
                result = method_func(response)
                if result:
                    print(f"✅ {method_name}: Found content ({len(str(result))} chars)")
                    print(f" Content: {str(result)[:100]}...")
                    if not extracted_text:  # Use the first successful method
                        extracted_text = str(result)
                        successful_method = method_name
                else:
                    print(f"❌ {method_name}: No content found")
            except Exception as e:
                # A probe that does not fit this response shape is reported, not fatal.
                print(f"❌ {method_name}: Error - {e}")

        if extracted_text:
            print(f"\n🎉 SUCCESSFULLY EXTRACTED TEXT using {successful_method}:")
            print(f"📝 Full extracted text: '{extracted_text}'")
        else:
            print("\n❌ NO TEXT EXTRACTED from any method")

        return True

    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        print(f" Error type: {type(e)}")

        # If it's a 401 error, the API key might be invalid
        if "401" in str(e) or "unauthorized" in str(e).lower():
            print(" This might be an API key issue. Please check your MISTRAL_API_KEY")

        return False
182
+
183
def test_api_connectivity():
    """Check that a Mistral client can be constructed with the configured key.

    No request is sent; only client construction is exercised.

    Returns:
        bool: False if ``MISTRAL_API_KEY`` is missing or constructing the
        client raises, True otherwise.
    """
    print("🌐 Testing API connectivity...")

    key = os.environ.get("MISTRAL_API_KEY")
    if not key:
        print("❌ No API key found")
        return False

    try:
        # Construction is the whole check: any immediate error is caught below.
        Mistral(api_key=key)
        print("🔄 Testing API connection...")
        print("✅ Mistral client appears to be working")
        return True
    except Exception as exc:
        print(f"❌ API connectivity test failed: {exc}")
        return False
208
+
209
def main():
    """Run the connectivity check, then the OCR check, and print next steps.

    Stops after the connectivity check if it fails.
    """
    print("🚀 Mistral OCR Quick Test")
    print("=" * 40)

    # Connectivity first: without it the OCR test cannot succeed.
    if not test_api_connectivity():
        print("\n❌ Basic connectivity test failed")
        return

    print("\n" + "="*40)

    ocr_ok = test_mistral_ocr()
    outcome = (
        "\n✅ OCR test completed - check the response analysis above"
        if ocr_ok
        else "\n❌ OCR test failed"
    )
    print(outcome)

    print("\n💡 Next steps:")
    for step in (
        " 1. If the test worked, run: python main.py",
        " 2. If there were errors, check the API key and try again",
        " 3. Use the response analysis to improve text extraction",
    ):
        print(step)
232
+
233
+ if __name__ == "__main__":
234
+ main()
tests/test_setup.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script for PDF Extractor setup validation
3
+ """
4
+
5
+ import sys
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
def test_imports():
    """Try importing each required third-party package.

    Returns:
        bool: True when gradio, mistralai and python-dotenv all import,
        False as soon as one of them raises ImportError.
    """
    try:
        import gradio  # noqa: F401 - imported only to prove availability
        print("✅ Gradio imported successfully")

        import mistralai  # noqa: F401
        print("✅ Mistral AI imported successfully")

        from dotenv import load_dotenv  # noqa: F401
        print("✅ python-dotenv imported successfully")
    except ImportError as exc:
        print(f"❌ Import error: {exc}")
        return False
    return True
25
+
26
def test_environment():
    """Check that ``MISTRAL_API_KEY`` is available after loading ``.env``.

    Returns:
        bool: True if the variable is set to a non-empty value, else False.
    """
    load_dotenv()

    if not os.environ.get("MISTRAL_API_KEY"):
        print("⚠️ MISTRAL_API_KEY not found in environment")
        print(" Please copy .env.example to .env and add your API key")
        return False

    # Report presence only; the key itself is never printed.
    print("✅ MISTRAL_API_KEY environment variable is set")
    return True
39
+
40
def main():
    """Run the import and environment checks and print a summary."""
    print("🔍 PDF Extractor Setup Validation")
    print("=" * 40)

    # Both checks always run so the user sees every problem in one pass.
    packages_ok = test_imports()
    environment_ok = test_environment()

    print("\n" + "=" * 40)
    if not packages_ok:
        print("❌ Package installation incomplete")
        print("📝 Next step: pip install -r requirements.txt")
        return

    print("✅ All packages are properly installed")
    if environment_ok:
        print("✅ Environment is configured correctly")
        print("🚀 Ready to run: python main.py")
    else:
        print("⚠️ Environment needs configuration")
        print("📝 Next step: Set up your .env file")
60
+
61
+ if __name__ == "__main__":
62
+ main()
ui/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """UI components for PDF Text Extractor."""
2
+ from ui.interface import create_interface
3
+ from ui.handlers import copy_text, download_text, process_images_for_display
4
+ from ui.components import (
5
+ create_header, create_upload_section, create_action_button,
6
+ create_text_display, create_action_buttons, create_image_gallery,
7
+ apply_custom_css
8
+ )
9
+
10
+ __all__ = [
11
+ "create_interface", "copy_text", "download_text", "process_images_for_display",
12
+ "create_header", "create_upload_section", "create_action_button",
13
+ "create_text_display", "create_action_buttons", "create_image_gallery",
14
+ "apply_custom_css"
15
+ ]
ui/components.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI components module for PDF Text Extractor.
3
+ Contains functions for creating individual UI components.
4
+ """
5
+
6
+ import gradio as gr
7
+ from typing import Tuple, List, Dict, Any
8
+
9
def create_header() -> gr.Markdown:
    """
    Create the application header.

    The header is a single Markdown component holding the title, a short
    description, numbered usage instructions and the supported-file note.

    Returns:
        gr.Markdown: Header component
    """
    # NOTE(review): the "up to 10MB" text is hard-coded here, while app.py
    # passes max_file_size from the app configuration to launch() — confirm
    # the two agree.
    return gr.Markdown("""
    # 🔍 PDF Text Extractor

    Extract text and images from PDF files using Mistral AI's OCR technology.

    **Instructions:**
    1. Upload a PDF file using the file selector below
    2. Wait for processing to complete
    3. View the extracted text and images
    4. Use the Copy or Download buttons to save the extracted text

    **Supported:** PDF files up to 10MB
    """)
29
+
30
def create_upload_section() -> gr.File:
    """
    Build the PDF upload widget.

    Returns:
        gr.File: File component labelled "Upload PDF File" that accepts a
        single ``.pdf`` file
    """
    uploader = gr.File(
        label="Upload PDF File",
        file_types=[".pdf"],
        file_count="single",
    )
    return uploader
42
+
43
def create_action_button() -> gr.Button:
    """
    Build the primary button that starts extraction.

    Returns:
        gr.Button: Button labelled "Extract Text & Images" with the
        ``primary`` variant
    """
    extract_button = gr.Button("Extract Text & Images", variant="primary")
    return extract_button
51
+
52
def create_text_display() -> Tuple[gr.Textbox, gr.Textbox]:
    """
    Build the two text boxes that show the extraction result and its status.

    The extracted-text box is created first, then the status box.

    Returns:
        Tuple[gr.Textbox, gr.Textbox]: (extracted-text box, status box)
    """
    extracted_box = gr.Textbox(
        label="Extracted Text",
        lines=10,
        max_lines=20,
        placeholder="Extracted text will appear here...",
        show_copy_button=True,
    )
    status_box = gr.Textbox(
        label="Status",
        lines=2,
        placeholder="Upload a PDF to see status...",
    )
    return extracted_box, status_box
74
+
75
def create_image_gallery() -> gr.Gallery:
    """
    Build the gallery that displays images extracted from the PDF.

    Returns:
        gr.Gallery: Visible, labelled gallery laid out as 3 columns by 2 rows
    """
    gallery_options = {
        "label": "Extracted Images",
        "columns": 3,
        "rows": 2,
        "object_fit": "contain",
        "height": "auto",
        "visible": True,
        "show_label": True,
        "elem_id": "image_gallery",
    }
    return gr.Gallery(**gallery_options)
92
+
93
def create_action_buttons() -> Tuple[gr.Button, gr.Button]:
    """
    Build the copy and download buttons.

    The copy button is created before the download button.

    Returns:
        Tuple[gr.Button, gr.Button]: (copy button, download button)
    """
    return (
        gr.Button("📋 Copy to Clipboard"),
        gr.Button("📥 Download as Text File"),
    )
104
+
105
def apply_custom_css() -> gr.HTML:
    """
    Apply custom CSS styling.

    The styles are delivered as a ``<style>`` element inside an HTML
    component. They cap the container width at 900px, set a monospace font
    for ``.output-markdown`` and centre ``.image-gallery-caption`` text.

    Returns:
        gr.HTML: HTML component with CSS styles
    """
    return gr.HTML("""
    <style>
    .gradio-container {
        max-width: 900px !important;
    }
    .output-markdown {
        font-family: 'Courier New', monospace;
    }
    .image-gallery-caption {
        text-align: center;
        font-size: 0.9em;
    }
    </style>
    """)
ui/handlers.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Event handlers for UI components.
3
+ Contains functions that handle user interactions with the interface.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ from typing import Optional, List, Dict, Any
9
+ from utils.pdf_image_extractor import PDFImageExtractor
10
+
11
def copy_text(text: str) -> str:
    """
    Handler for the Copy button: pass the text through untouched.

    Args:
        text: Current contents of the extracted-text box

    Returns:
        str: Exactly the value received
    """
    return text
22
+
23
def download_text(text: str) -> Optional[str]:
    """
    Handle Download button click.

    The text is written as UTF-8 to ``extracted_text.txt`` inside a freshly
    created temporary directory. A new directory per call keeps the
    download name stable while ensuring simultaneous requests never
    overwrite each other's file.

    Args:
        text: Text to download

    Returns:
        Optional[str]: Path to the created text file or None if text is empty
    """
    if not text:
        return None

    # One private directory per call; a fixed path in the shared temp dir
    # would be clobbered by any concurrent download.
    download_dir = tempfile.mkdtemp(prefix="pdf_text_")
    file_path = os.path.join(download_dir, "extracted_text.txt")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    return file_path
49
+
50
def process_images_for_display(images_data: List[Dict[str, Any]], pdf_path: Optional[str] = None) -> List:
    """
    Process images for display in the Gradio gallery.

    Sources are tried in this order, and the first one that yields any
    image wins:

    1. Crop the images out of the PDF using the OCR bounding boxes.
    2. Extract every image embedded in the PDF (truncated to the number of
       images the OCR response reported).
    3. Decode the base64 payloads carried in ``images_data``.

    Steps 1 and 2 only run when ``pdf_path`` points to an existing file.

    Args:
        images_data: List of image data dictionaries from OCR response
        pdf_path: Path to the original PDF file for image extraction

    Returns:
        List: List of image paths for gallery display (empty when
        ``images_data`` is empty or nothing could be decoded)
    """
    import base64

    if not images_data:
        return []

    # If we have PDF path and bounding box data, extract images from PDF
    if pdf_path and os.path.exists(pdf_path):
        print("🖼️ Extracting images from PDF using bounding box coordinates...")
        extracted_paths = PDFImageExtractor.extract_images_from_pdf(pdf_path, images_data)
        if extracted_paths:
            return extracted_paths

        # Fallback: extract all images from PDF if bounding box extraction failed
        print("🔄 Fallback: Extracting all images from PDF...")
        extracted_paths = PDFImageExtractor.extract_all_images_from_pdf(pdf_path)
        if extracted_paths:
            return extracted_paths[:len(images_data)]  # Limit to expected number of images

    # Fallback: use base64 data from OCR response
    print("🔄 Using base64 image data from OCR response...")
    gallery_images = []
    output_dir = None  # created lazily, only if there is something to write

    for index, img_data in enumerate(images_data):
        try:
            base64_data = img_data.get('base64', '')
            if not base64_data:
                continue

            # The payload may arrive as a data URI ("data:image/jpeg;base64,....").
            # Decoding the prefix along with the payload corrupts the image,
            # so keep only the part after the first comma.
            if base64_data.startswith("data:") and "," in base64_data:
                base64_data = base64_data.split(",", 1)[1]

            image_bytes = base64.b64decode(base64_data)

            # A private directory per call prevents concurrent requests from
            # overwriting each other's identically named files.
            if output_dir is None:
                output_dir = tempfile.mkdtemp(prefix="pdf_images_")
            img_path = os.path.join(output_dir, f"extracted_image_fallback_{index}.jpg")

            with open(img_path, "wb") as img_file:
                img_file.write(image_bytes)

            # Add path to gallery list (Gradio Gallery expects a list of paths)
            gallery_images.append(img_path)

        except Exception as e:
            # Best effort: one bad image must not hide the others.
            print(f"Error processing image {index}: {str(e)}")

    return gallery_images
ui/interface.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interface creation module for PDF Text Extractor.
3
+ Defines the Gradio interface components and layout.
4
+ """
5
+
6
+ import gradio as gr
7
+ from pdf_text_extractor import PDFTextExtractor
8
+ from ui.handlers import copy_text, download_text, process_images_for_display
9
+ from ui.components import (
10
+ create_header, create_upload_section, create_action_button,
11
+ create_text_display, create_action_buttons, create_image_gallery, apply_custom_css
12
+ )
13
+
14
def create_dummy_interface() -> gr.Blocks:
    """
    Build the placeholder UI shown when the API key is not configured.

    The layout offers an upload field, an extract button, a text box, a
    status box and copy/download buttons, but every button is created
    non-interactive and no event handlers are attached.

    Returns:
        gr.Blocks: Gradio interface with disabled functionality
    """
    unavailable_text = "API key not configured. Text extraction is unavailable."
    status_text = "❌ MISTRAL_API_KEY environment variable is not set. Please set it and restart the application."

    with gr.Blocks(title="PDF Text Extractor") as placeholder_ui:
        gr.Markdown("""
        # 🔍 PDF Text Extractor

        ⚠️ **API key not configured.** Please set MISTRAL_API_KEY environment variable and restart the application.
        """)

        # Upload field
        with gr.Row():
            gr.File(label="Upload PDF", file_types=[".pdf"])

        # Extraction trigger (disabled)
        with gr.Row():
            gr.Button("Extract Text", variant="primary", interactive=False)

        # Read-only text area explaining why nothing can be extracted
        with gr.Row():
            gr.Textbox(
                label="Extracted Text",
                lines=10,
                value=unavailable_text,
                interactive=False
            )

        # Status message telling the user how to fix the configuration
        with gr.Row():
            gr.Textbox(label="Status", lines=2, value=status_text)

        # Text actions (disabled)
        with gr.Row():
            gr.Button("📋 Copy to Clipboard", interactive=False)
            gr.Button("📥 Download as Text File", interactive=False)

    return placeholder_ui
54
+
55
+
56
+
57
def create_main_interface(extractor: PDFTextExtractor) -> gr.Blocks:
    """
    Assemble the full application UI and wire it to the given extractor.

    Args:
        extractor: PDFTextExtractor instance

    Returns:
        gr.Blocks: Gradio interface with full functionality
    """
    def process_pdf_wrapper(pdf_file):
        """Run the extractor (captured from the closure) and prepare gallery images."""
        text, status_message, ocr_images = extractor.extract_text_from_pdf(pdf_file)

        # The upload value is either an object exposing ``.name`` or is used
        # as the path itself; a falsy value means there is no PDF path.
        if hasattr(pdf_file, 'name'):
            source_path = pdf_file.name
        elif pdf_file:
            source_path = pdf_file
        else:
            source_path = None

        return text, status_message, process_images_for_display(ocr_images, source_path)

    with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as app_ui:
        # Header
        create_header()

        # File upload
        with gr.Row():
            upload_input = create_upload_section()

        # Extract button
        with gr.Row():
            extract_button = create_action_button()

        # Status display
        with gr.Row():
            status_box = gr.Textbox(
                label="Status",
                lines=2,
                placeholder="Upload a PDF to see status..."
            )

        # One tab for the text, one for the images
        with gr.Tabs():
            with gr.TabItem("Extracted Text"):
                text_box = gr.Textbox(
                    label="Extracted Text",
                    lines=15,
                    max_lines=30,
                    placeholder="Extracted text will appear here...",
                    show_copy_button=True
                )

                # Copy / download buttons for the text
                with gr.Row():
                    copy_button, download_button = create_action_buttons()

            with gr.TabItem("Extracted Images"):
                gallery = create_image_gallery()
                # Static hint; the component is rendered but never updated
                gr.Markdown("Images extracted from the PDF will appear here.")

        # Extraction fills the text box, the status box and the gallery
        extract_button.click(
            fn=process_pdf_wrapper,
            inputs=[upload_input],
            outputs=[text_box, status_box, gallery]
        )

        # Copy: the JS snippet writes to the clipboard and shows a short-lived toast
        copy_button.click(
            fn=copy_text,
            inputs=text_box,
            outputs=None,
            js="""
            function(text) {
                if (text) {
                    navigator.clipboard.writeText(text);
                    // Show a temporary notification
                    var notification = document.createElement('div');
                    notification.textContent = 'Text copied to clipboard!';
                    notification.style.position = 'fixed';
                    notification.style.bottom = '20px';
                    notification.style.left = '50%';
                    notification.style.transform = 'translateX(-50%)';
                    notification.style.padding = '10px 20px';
                    notification.style.background = '#4CAF50';
                    notification.style.color = 'white';
                    notification.style.borderRadius = '4px';
                    notification.style.zIndex = '1000';
                    document.body.appendChild(notification);
                    setTimeout(function() {
                        document.body.removeChild(notification);
                    }, 2000);
                }
                return text;
            }
            """
        )

        # Download: the handler's result is delivered through a File component
        download_target = gr.File(label="Download", elem_id="download_output")
        download_button.click(
            fn=download_text,
            inputs=text_box,
            outputs=download_target,
            show_progress=False
        )

        # Custom CSS styling
        apply_custom_css()

    return app_ui
163
+
164
def create_interface() -> gr.Blocks:
    """
    Create and configure the Gradio interface.

    Instantiates the PDF extractor and builds the full UI around it. When
    the extractor constructor raises ``ValueError`` (presumably its way of
    signalling a missing API key; confirm against PDFTextExtractor), a
    placeholder interface with disabled functionality is returned instead.

    Returns:
        gr.Blocks: Configured Gradio interface
    """
    # Only the constructor is guarded: a ValueError raised while building the
    # UI is a genuine bug and must not be reported as "API key not configured".
    try:
        extractor = PDFTextExtractor()
    except ValueError as e:
        # Surface the reason instead of silently discarding it
        print(f"⚠️ PDF extractor unavailable: {e}")
        return create_dummy_interface()

    return create_main_interface(extractor)
utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Utility functions for PDF Text Extractor."""
2
+ from utils.config import check_api_key, get_app_config
3
+
4
+ __all__ = ["check_api_key", "get_app_config"]
utils/config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration utilities for PDF Text Extractor.
3
+ Contains functions for handling environment variables and app configuration.
4
+ """
5
+
6
+ import os
7
+ from typing import Dict, Any
8
+
9
def check_api_key() -> bool:
    """
    Report whether MISTRAL_API_KEY is present (and non-empty) in the environment.

    When the key is missing, setup guidance is printed to stdout.

    Returns:
        bool: True if API key is set, False otherwise
    """
    if os.environ.get("MISTRAL_API_KEY"):
        return True

    guidance = (
        "⚠️ Warning: MISTRAL_API_KEY environment variable is not set.",
        " Please set it before using the PDF extraction functionality.",
        " Example: export MISTRAL_API_KEY='your-api-key-here'",
        "",  # trailing blank line separates the warning from later output
    )
    for message in guidance:
        print(message)
    return False
24
+
25
def get_app_config() -> Dict[str, Any]:
    """
    Return the settings used to launch the application.

    A new dictionary is built on every call.

    Returns:
        Dict[str, Any]: Application configuration settings
    """
    settings: Dict[str, Any] = {}
    settings["server_port"] = 7861  # Use different port to avoid conflicts
    settings["debug"] = True  # Enable debug mode for development
    settings["quiet"] = False  # Show startup messages
    settings["max_file_size"] = "10mb"  # Limit PDF file size
    # Optional settings for external access and public link sharing
    # (add them here to enable):
    #   "server_name": "0.0.0.0"  -> allow external access
    #   "share": True             -> create public link
    return settings
utils/pdf_image_extractor.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Image Extraction utilities.
3
+ Extracts images from PDF using bounding box coordinates.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ from typing import List, Dict, Any, Optional
9
+ import fitz # PyMuPDF
10
+ from PIL import Image
11
+ import base64
12
+ import io
13
+
14
+
15
class PDFImageExtractor:
    """Extract images from PDF using bounding box coordinates."""

    @staticmethod
    def _decode_base64_image(base64_data: str) -> bytes:
        """Decode a base64 image payload, tolerating a ``data:...;base64,`` prefix."""
        # b64decode silently skips ':' ';' ',' and would decode the rest of a
        # data-URI prefix as payload, corrupting the image; strip it first.
        if isinstance(base64_data, str) and base64_data.startswith("data:") and "," in base64_data:
            base64_data = base64_data.split(",", 1)[1]
        return base64.b64decode(base64_data)

    @staticmethod
    def extract_images_from_pdf(pdf_path: str, images_data: List[Dict[str, Any]]) -> List[str]:
        """
        Extract images from PDF using bounding box coordinates.

        Each entry of ``images_data`` is read for the keys ``'page'``,
        ``'top_left_x'``, ``'top_left_y'``, ``'bottom_right_x'`` and
        ``'bottom_right_y'`` (all defaulting to 0). The region is rendered at
        2x zoom and written as a PNG into the system temp directory. If
        rendering an entry fails, its ``'base64'`` payload (when present) is
        written out instead.

        Args:
            pdf_path: Path to the PDF file
            images_data: List of image data with bounding box coordinates

        Returns:
            List[str]: List of paths to extracted image files; empty when
            ``images_data`` is empty or the PDF cannot be processed
        """
        if not images_data:
            return []

        extracted_image_paths = []
        temp_dir = tempfile.gettempdir()

        try:
            # The context manager closes the document even if an error escapes
            # the per-image handling below.
            with fitz.open(pdf_path) as pdf_doc:
                page_count = len(pdf_doc)

                for index, img_data in enumerate(images_data):
                    try:
                        page_num = img_data.get('page', 0)

                        # Reject out-of-range pages, including negative indices,
                        # which would otherwise silently select a page counted
                        # from the end of the document.
                        if not 0 <= page_num < page_count:
                            print(f"Warning: Page {page_num} not found in PDF (max: {page_count-1})")
                            continue

                        page = pdf_doc[page_num]

                        # PyMuPDF uses (x0, y0, x1, y1) format.
                        # NOTE(review): the coordinates are passed through
                        # unchanged, i.e. assumed to already be in the page's
                        # coordinate units; confirm against the OCR response.
                        bbox = fitz.Rect(
                            img_data.get('top_left_x', 0),
                            img_data.get('top_left_y', 0),
                            img_data.get('bottom_right_x', 0),
                            img_data.get('bottom_right_y', 0),
                        )

                        # Render only the clipped region, at 2x zoom for better quality
                        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), clip=bbox)

                        # The pixmap already encodes to PNG, so write the bytes
                        # directly instead of round-tripping through PIL.
                        img_filename = f"extracted_image_page{page_num}_{index}.png"
                        img_path = os.path.join(temp_dir, img_filename)
                        with open(img_path, "wb") as img_file:
                            img_file.write(pix.tobytes("png"))

                        extracted_image_paths.append(img_path)
                        print(f"✅ Extracted image {index} from page {page_num}: {img_path}")

                    except Exception as e:
                        print(f"Error extracting image {index}: {str(e)}")

                        # Fallback: try to use base64 data if available
                        base64_data = img_data.get('base64', '')
                        if base64_data:
                            try:
                                img_filename = f"extracted_image_base64_{index}.jpg"
                                img_path = os.path.join(temp_dir, img_filename)

                                with open(img_path, "wb") as img_file:
                                    img_file.write(PDFImageExtractor._decode_base64_image(base64_data))

                                extracted_image_paths.append(img_path)
                                print(f"✅ Used base64 data for image {index}: {img_path}")
                            except Exception as e2:
                                print(f"Error using base64 data for image {index}: {str(e2)}")

            return extracted_image_paths

        except Exception as e:
            print(f"Error opening PDF file: {str(e)}")
            return []

    @staticmethod
    def extract_all_images_from_pdf(pdf_path: str) -> List[str]:
        """
        Extract all images from PDF without using bounding boxes.
        This is a fallback method when no bounding box data is available.

        Every image listed by each page is written as a PNG into the system
        temp directory. Images that fail to convert are reported and skipped.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List[str]: List of paths to extracted image files; empty when the
            PDF cannot be processed
        """
        extracted_image_paths = []
        temp_dir = tempfile.gettempdir()

        try:
            # The context manager closes the document even when a page-level
            # call (outside the per-image try) raises.
            with fitz.open(pdf_path) as pdf_doc:
                for page_num in range(len(pdf_doc)):
                    page = pdf_doc[page_num]

                    for img_index, img in enumerate(page.get_images()):
                        try:
                            # First item of each entry is the image's xref
                            pix = fitz.Pixmap(pdf_doc, img[0])

                            # Fewer than 4 colour components means GRAY or RGB,
                            # which is encoded as-is; otherwise (CMYK) the
                            # pixmap is converted to RGB before encoding.
                            if pix.n - pix.alpha >= 4:
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            img_bytes = pix.tobytes("png")

                            img_filename = f"all_images_page{page_num}_img{img_index}.png"
                            img_path = os.path.join(temp_dir, img_filename)

                            with open(img_path, "wb") as f:
                                f.write(img_bytes)

                            extracted_image_paths.append(img_path)

                        except Exception as e:
                            print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")

            return extracted_image_paths

        except Exception as e:
            print(f"Error extracting all images from PDF: {str(e)}")
            return []