Spaces:

Agents-MCP-Hackathon
/

pdf_explainer

Sleeping

spagestic committed on Jun 6

Commit

c725c9c

1 Parent(s): 580b86f

feat: Refactor PDF Text Extractor application structure

- Introduced a modular architecture by separating the application into distinct modules: app.py, ui, and utils.
- Implemented a main function in app.py to handle application launch and configuration.
- Added environment variable loading and API key validation.
- Created a .env.example file for environment variable setup guidance.
- Enhanced the UI components and handlers for better user interaction.
- Developed a comprehensive PDF text extraction utility using Mistral AI.
- Added tests for OCR functionality and setup validation.
- Updated .gitignore to exclude environment files and unnecessary artifacts.

Files changed (15) hide show

.env.example +5 -0
.gitignore +47 -0
app.py +33 -5
main.py +15 -0
pdf_text_extractor.py +254 -0
requirements.txt +0 -0
tests/test_ocr_direct.py +234 -0
tests/test_setup.py +62 -0
ui/__init__.py +15 -0
ui/components.py +125 -0
ui/handlers.py +104 -0
ui/interface.py +177 -0
utils/__init__.py +4 -0
utils/config.py +40 -0
utils/pdf_image_extractor.py +155 -0

.env.example ADDED Viewed

	@@ -0,0 +1,5 @@

+# Environment variables for PDF Explainer
+# Copy this file to .env and fill in your actual API key
+# Mistral AI API Key - Get yours from https://console.mistral.ai/
+MISTRAL_API_KEY=your_mistral_api_key_here

.gitignore ADDED Viewed

	@@ -0,0 +1,47 @@

+# Environment variables
+**/.env
+# Python cache
+**/__pycache__/
+*.py[cod]
+*$py.class
+# Virtual environment
+**/.venv
+.venv/
+venv/
+env/
+# IDE files
+.vscode/settings.json
+.idea/
+# OS files
+.DS_Store
+Thumbs.db
+# Gradio temporary files
+gradio_cached_examples/
+flagged/
+# Log files
+*.log
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST

app.py CHANGED Viewed

@@ -1,7 +1,35 @@
-import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+"""
+PDF Text Extractor Application
+Main entry point for the PDF Text Extractor application.
+"""
+import os
+from dotenv import load_dotenv
+from ui import create_interface
+from utils.config import check_api_key, get_app_config
def main():
    """Start the PDF Text Extractor web application.

    Loads variables from a local ``.env`` file, runs the API-key check,
    builds the Gradio interface and launches it with the values returned by
    ``get_app_config()``.
    """
    load_dotenv()  # make variables from .env visible through os.environ
    check_api_key()

    # Build the interface first, then read the launch settings.
    interface = create_interface()
    settings = get_app_config()

    launch_options = {
        option: settings[option]
        for option in ("server_port", "debug", "quiet", "max_file_size")
    }
    interface.launch(**launch_options)


if __name__ == "__main__":
    main()

main.py ADDED Viewed

	@@ -0,0 +1,15 @@

+"""
+PDF Text Extractor using Gradio and Mistral AI
+A web application for extracting text from PDF files using Mistral's OCR capabilities.
+This is a legacy entry point that maintains compatibility with the original app.
+For a more modular structure, see app.py and the ui/ and utils/ folders.
+"""
+# Import from the new modular structure
+from app import main
+# Execute the main function when run as script
+if __name__ == "__main__":
+    main()

pdf_text_extractor.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import base64
+import os
+from typing import Optional, Tuple, List, Dict, Any
+from mistralai import Mistral
class PDFTextExtractor:
    """PDF text and image extraction using Mistral AI OCR.

    The public surface is ``encode_pdf`` and ``extract_text_from_pdf``; the
    underscore-prefixed methods are internal steps of the extraction
    pipeline.
    """

    # Attribute names reported by the diagnostic dump when present.
    _COMMON_ATTRS = (
        'text', 'content', 'result', 'data', 'output',
        'extracted_text', 'ocr_text', 'choices', 'message',
    )
    # Keys probed, in order, when the response behaves like a mapping.
    _DICT_TEXT_KEYS = (
        'text', 'content', 'result', 'extracted_text', 'ocr_text', 'output',
    )

    def __init__(self):
        """Initialize the PDF text extractor with Mistral AI client.

        Raises:
            ValueError: If ``MISTRAL_API_KEY`` is not set in the environment.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is required")
        self.client = Mistral(api_key=self.api_key)

    def encode_pdf(self, pdf_path: str) -> Optional[str]:
        """
        Encode the PDF file to base64.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Base64 encoded string or None if error
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"Error: The file {pdf_path} was not found.")
            return None
        except Exception as e:
            # Deliberate best effort: callers treat None as "could not encode".
            print(f"Error encoding PDF: {e}")
            return None

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
        """
        Extract text and images from uploaded PDF using Mistral AI OCR.

        Args:
            pdf_file: Gradio file object (anything with a ``name`` attribute)
                or a plain path.

        Returns:
            Tuple of (extracted_text, status_message, images_data). On failure
            the text is empty, the status describes the error and the image
            list is empty; no exception is propagated.
        """
        if pdf_file is None:
            return "", "Please upload a PDF file.", []
        try:
            # Get the file path from Gradio file object
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
            base64_pdf = self.encode_pdf(pdf_path)
            if base64_pdf is None:
                return "", "Failed to encode PDF file.", []

            print("🔄 Processing PDF with Mistral OCR...")
            ocr_response = self.client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": f"data:application/pdf;base64,{base64_pdf}"
                },
                include_image_base64=True
            )
            self._log_response_structure(ocr_response)

            # Strategy 1: Mistral OCR specific - pages with markdown and images
            extracted_text, extraction_method, extracted_images = (
                self._extract_from_pages(ocr_response)
            )
            if not extracted_images:
                extracted_images = self._extract_top_level_images(ocr_response)

            # The heuristic fallbacks exist for unknown response shapes. When
            # the response already has the expected ``pages`` list, an empty
            # result means the document has no text; running the heuristics
            # then would report unrelated strings (another attribute, or the
            # whole response repr) as the extracted text.
            has_pages_shape = isinstance(getattr(ocr_response, 'pages', None), list)
            if not extracted_text and not has_pages_shape:
                extracted_text, extraction_method = self._extract_text_fallback(ocr_response)

            print(f"🎯 Extraction method used: {extraction_method}")
            print(f"📏 Extracted text length: {len(extracted_text)} characters")
            print(f"🖼️ Extracted images: {len(extracted_images)}")

            if extracted_text:
                status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
                if extracted_images:
                    status += f" and {len(extracted_images)} image(s)"
            else:
                extracted_text = "No text could be extracted from this PDF."
                status = "⚠️ OCR completed but no text was found in response."
                if extracted_images:
                    status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
                print("❌ No extractable text found in OCR response")
            return extracted_text, status, extracted_images
        except Exception as e:
            # UI boundary: report the failure as a status instead of raising.
            error_msg = f"Error processing PDF: {str(e)}"
            print(error_msg)
            return "", f"❌ {error_msg}", []

    def _log_response_structure(self, ocr_response) -> None:
        """Print a diagnostic summary of the OCR response object."""
        print("🔍 Analyzing OCR Response Structure...")
        print(f"  Type: {type(ocr_response)}")
        print(f"  String representation: {str(ocr_response)[:500]}...")
        if hasattr(ocr_response, '__dict__'):
            print(f"  Object attributes: {list(ocr_response.__dict__.keys())}")
            for key, value in ocr_response.__dict__.items():
                print(f"    {key}: {type(value)} = {str(value)[:100]}...")
        for attr in self._COMMON_ATTRS:
            if hasattr(ocr_response, attr):
                value = getattr(ocr_response, attr)
                print(f"  Has '{attr}': {type(value)} = {str(value)[:100]}...")
        # Only sized containers are inspected, so a one-shot iterator is never
        # consumed by logging and nothing is iterated twice.
        is_sized_iterable = (
            hasattr(ocr_response, '__iter__')
            and hasattr(ocr_response, '__len__')
            and not isinstance(ocr_response, str)
        )
        if is_sized_iterable:
            try:
                print(f"  Iterable with {len(ocr_response)} items")
                for i, item in enumerate(ocr_response):
                    if i >= 3:  # Show first 3 items
                        break
                    print(f"    Item {i}: {type(item)} = {str(item)[:100]}...")
            except Exception as e:
                print(f"  Error checking iteration: {e}")

    @staticmethod
    def _image_record(img, page_index: int, image_id) -> Dict[str, Any]:
        """Build the image dictionary returned to callers from one OCR image."""
        return {
            'page': page_index,
            'image_id': image_id,
            'top_left_x': getattr(img, 'top_left_x', 0),
            'top_left_y': getattr(img, 'top_left_y', 0),
            'bottom_right_x': getattr(img, 'bottom_right_x', 0),
            'bottom_right_y': getattr(img, 'bottom_right_y', 0),
            'base64': getattr(img, 'image_base64', ''),
        }

    def _extract_from_pages(self, ocr_response) -> Tuple[str, str, List[Dict[str, Any]]]:
        """Collect per-page markdown and images from ``ocr_response.pages``.

        Returns:
            (text, extraction_method, images); text is "" and the method is
            "none" when no page carried markdown.
        """
        pages = getattr(ocr_response, 'pages', None)
        if not (isinstance(pages, list) and pages):
            return "", "none", []

        page_texts = []
        images = []
        for i, page in enumerate(pages):
            markdown = getattr(page, 'markdown', None)
            if markdown:
                page_texts.append(markdown)
                print(f"✅ Found text in page {i} markdown: {len(markdown)} characters")
            for j, img in enumerate(getattr(page, 'images', None) or []):
                record = self._image_record(img, i, f"img-{i}-{j}")
                images.append(record)
                print(f"✅ Found image in page {i}, image {j}: coordinates ({record['top_left_x']}, {record['top_left_y']}) to ({record['bottom_right_x']}, {record['bottom_right_y']})")

        if page_texts:
            return "\n\n".join(page_texts), f"pages_markdown_{len(page_texts)}_pages", images
        return "", "none", images

    def _extract_top_level_images(self, ocr_response) -> List[Dict[str, Any]]:
        """Collect images from a response-level ``images`` attribute, if any."""
        images = []
        for j, img in enumerate(getattr(ocr_response, 'images', None) or []):
            record = self._image_record(img, 0, getattr(img, 'id', f"img-{j}"))
            images.append(record)
            print(f"✅ Found image {j}: coordinates ({record['top_left_x']}, {record['top_left_y']}) to ({record['bottom_right_x']}, {record['bottom_right_y']})")
        return images

    def _extract_text_fallback(self, ocr_response) -> Tuple[str, str]:
        """Try the heuristic strategies for responses without a ``pages`` list.

        Returns:
            (text, extraction_method); text is "" when nothing was found.
        """
        text, method = self._probe_known_shapes(ocr_response)
        if not text:
            # Strategy 8: use the string form if it looks like content rather
            # than a default object reference such as "<Foo object at ...>".
            response_str = str(ocr_response)
            if len(response_str) > 50 and not response_str.startswith('<'):
                return response_str, "full_response_string"
        return text, method

    def _probe_known_shapes(self, ocr_response) -> Tuple[str, str]:
        """Strategies 2-7: look for text under commonly used attributes/keys."""
        # Strategy 2: Direct text attribute
        text_attr = getattr(ocr_response, 'text', None)
        if text_attr:
            return str(text_attr), "direct_text_attribute"

        # Strategy 3: Content attribute
        content = getattr(ocr_response, 'content', None)
        if content:
            if isinstance(content, str):
                return content, "content_attribute_string"
            if hasattr(content, 'text'):
                return str(content.text), "content_text_attribute"
            return str(content), "content_attribute_converted"

        # Strategy 4: Result attribute. An empty or None result must not be
        # stringified into text such as "None".
        result = getattr(ocr_response, 'result', None)
        if result:
            if isinstance(result, str):
                return result, "result_string"
            if hasattr(result, 'text'):
                return str(result.text), "result_text_attribute"
            if isinstance(result, dict) and 'text' in result:
                return str(result['text']), "result_dict_text"
            return str(result), "result_converted"

        # Strategy 5: Choices attribute (chat-completion style response)
        choices = getattr(ocr_response, 'choices', None)
        if isinstance(choices, list) and choices:
            choice = choices[0]
            if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
                return str(choice.message.content), "choices_message_content"
            if hasattr(choice, 'text'):
                return str(choice.text), "choices_text"
            return str(choice), "choices_converted"

        # Strategy 6: Dict-like access
        getter = getattr(ocr_response, 'get', None)
        if callable(getter):
            for key in self._DICT_TEXT_KEYS:
                value = getter(key)
                if value:
                    return str(value), f"dict_key_{key}"

        # Strategy 7: Inspect all attributes for string-like content
        if hasattr(ocr_response, '__dict__'):
            for key, value in ocr_response.__dict__.items():
                if isinstance(value, str) and len(value) > 20:  # Likely text content
                    return value, f"attribute_{key}"
                if hasattr(value, 'text') and isinstance(value.text, str):
                    return str(value.text), f"nested_text_in_{key}"

        return "", "none"

requirements.txt ADDED Viewed

File without changes

tests/test_ocr_direct.py ADDED Viewed

	@@ -0,0 +1,234 @@

+"""
+Quick OCR Test Script
+Tests the Mistral AI OCR functionality directly without the Gradio interface.
+"""
+import base64
+import os
+import tempfile
+from mistralai import Mistral
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
def create_simple_pdf_content():
    """Return the bytes of a small hand-written, single-page PDF.

    The document contains one text object reading
    "Hello World! Test OCR" and is used as the OCR test input.

    Returns:
        bytes: The PDF source encoded as UTF-8.
    """
    # Hand-written PDF source; kept as one literal so it is sent verbatim.
    document_source = """%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources <<
/Font <<
/F1 5 0 R
>>
>>
>>
endobj
4 0 obj
<<
/Length 44
>>
stream
BT
/F1 12 Tf
72 720 Td
(Hello World! Test OCR) Tj
ET
endstream
endobj
5 0 obj
<<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
xref
0 6
0000000000 65535 f
0000000010 00000 n
0000000079 00000 n
0000000173 00000 n
0000000301 00000 n
0000000380 00000 n
trailer
<<
/Size 6
/Root 1 0 R
>>
startxref
456
%%EOF"""
    return bytes(document_source, 'utf-8')
def test_mistral_ocr():
    """Test the Mistral OCR functionality directly.

    Sends the built-in sample PDF to the OCR endpoint, prints a structural
    analysis of the response and tries several ways of reading text from it.

    Returns:
        bool: True when the OCR request completed (even if no text could be
        read from the response), False when the API key is missing or the
        request raised.
    """
    print("🧪 Starting Mistral OCR Test...")

    # Check API key
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("❌ MISTRAL_API_KEY environment variable not found")
        print("   Please set it in your .env file or environment")
        return False
    # Never echo any part of the secret; test output is often pasted or logged.
    print("✅ API key found")

    try:
        # Initialize Mistral client
        client = Mistral(api_key=api_key)
        print("✅ Mistral client initialized")

        # Create a simple test PDF
        pdf_content = create_simple_pdf_content()
        base64_pdf = base64.b64encode(pdf_content).decode('utf-8')
        print(f"✅ Test PDF created ({len(pdf_content)} bytes)")

        # Test the OCR endpoint
        print("🔄 Sending OCR request to Mistral...")
        response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}"
            },
            include_image_base64=True
        )
        print("✅ OCR request completed")

        # Analyze the response
        print("\n🔍 RESPONSE ANALYSIS:")
        print(f"Response type: {type(response)}")
        print(f"Response string: {str(response)[:200]}...")
        if hasattr(response, '__dict__'):
            print(f"Response attributes: {list(response.__dict__.keys())}")
            for key, value in response.__dict__.items():
                print(f"  {key}: {type(value)} = {str(value)[:100]}...")

        # Test all possible text extraction methods
        print("\n🎯 TESTING TEXT EXTRACTION METHODS:")
        methods = [
            ("response.pages[].markdown", lambda r: "\n".join([page.markdown for page in r.pages]) if hasattr(r, 'pages') and r.pages and all(hasattr(p, 'markdown') for p in r.pages) else None),
            ("response.text", lambda r: getattr(r, 'text', None)),
            ("response.content", lambda r: getattr(r, 'content', None)),
            ("response.result", lambda r: getattr(r, 'result', None)),
            ("response.data", lambda r: getattr(r, 'data', None)),
            ("response['text']", lambda r: r.get('text') if hasattr(r, 'get') else None),
            ("response['content']", lambda r: r.get('content') if hasattr(r, 'get') else None),
        ]
        extracted_text = None
        successful_method = None
        for method_name, method_func in methods:
            # Each probe is isolated so one failing accessor does not stop
            # the remaining ones from being reported.
            try:
                result = method_func(response)
                if result:
                    print(f"✅ {method_name}: Found content ({len(str(result))} chars)")
                    print(f"   Content: {str(result)[:100]}...")
                    if not extracted_text:  # Use the first successful method
                        extracted_text = str(result)
                        successful_method = method_name
                else:
                    print(f"❌ {method_name}: No content found")
            except Exception as e:
                print(f"❌ {method_name}: Error - {e}")

        if extracted_text:
            print(f"\n🎉 SUCCESSFULLY EXTRACTED TEXT using {successful_method}:")
            print(f"📝 Full extracted text: '{extracted_text}'")
        else:
            print("\n❌ NO TEXT EXTRACTED from any method")
        return True
    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        print(f"   Error type: {type(e)}")
        # If it's a 401 error, the API key might be invalid
        if "401" in str(e) or "unauthorized" in str(e).lower():
            print("   This might be an API key issue. Please check your MISTRAL_API_KEY")
        return False
def test_api_connectivity():
    """Check that a Mistral client can be constructed with the configured key.

    No request is sent; only client construction is exercised.

    Returns:
        bool: False when the key is missing or construction raises,
        True otherwise.
    """
    print("🌐 Testing API connectivity...")

    key = os.environ.get("MISTRAL_API_KEY")
    if not key:
        print("❌ No API key found")
        return False

    try:
        # Constructing the client is the only step performed here.
        Mistral(api_key=key)
        print("🔄 Testing API connection...")
        print("✅ Mistral client appears to be working")
    except Exception as e:
        print(f"❌ API connectivity test failed: {e}")
        return False
    return True
def main():
    """Run the connectivity check, then the OCR check, and print next steps."""
    divider = "=" * 40
    print("🚀 Mistral OCR Quick Test")
    print(divider)

    # Stop early when even the basic client check fails.
    if not test_api_connectivity():
        print("\n❌ Basic connectivity test failed")
        return

    print("\n" + divider)

    ocr_ok = test_mistral_ocr()
    if ocr_ok:
        print("\n✅ OCR test completed - check the response analysis above")
    else:
        print("\n❌ OCR test failed")

    for hint in (
        "\n💡 Next steps:",
        "   1. If the test worked, run: python main.py",
        "   2. If there were errors, check the API key and try again",
        "   3. Use the response analysis to improve text extraction",
    ):
        print(hint)


if __name__ == "__main__":
    main()

tests/test_setup.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""
+Test script for PDF Extractor setup validation
+"""
+import sys
+import os
+from dotenv import load_dotenv
def test_imports():
    """Verify that gradio, mistralai and python-dotenv can be imported.

    Returns:
        bool: True when every import succeeds, False after printing the
        first import error otherwise.
    """
    try:
        import gradio as gr  # noqa: F401 - imported only to prove availability
        print("✅ Gradio imported successfully")

        import mistralai  # noqa: F401
        print("✅ Mistral AI imported successfully")

        from dotenv import load_dotenv  # noqa: F401
        print("✅ python-dotenv imported successfully")
    except ImportError as import_failure:
        print(f"❌ Import error: {import_failure}")
        return False
    else:
        return True
def test_environment():
    """Check that MISTRAL_API_KEY is available after loading ``.env``.

    Returns:
        bool: True when the variable is set to a non-empty value.
    """
    load_dotenv()

    if not os.environ.get("MISTRAL_API_KEY"):
        print("⚠️  MISTRAL_API_KEY not found in environment")
        print("   Please copy .env.example to .env and add your API key")
        return False

    # Only confirm presence; the key itself is never printed.
    print("✅ MISTRAL_API_KEY environment variable is set")
    return True
def main():
    """Run the import and environment checks and print a summary."""
    rule = "=" * 40
    print("🔍 PDF Extractor Setup Validation")
    print(rule)

    # Both checks always run so the user sees every problem at once.
    packages_ok = test_imports()
    environment_ok = test_environment()

    print("\n" + rule)

    if not packages_ok:
        print("❌ Package installation incomplete")
        print("📝 Next step: pip install -r requirements.txt")
        return

    print("✅ All packages are properly installed")
    if environment_ok:
        print("✅ Environment is configured correctly")
        print("🚀 Ready to run: python main.py")
    else:
        print("⚠️  Environment needs configuration")
        print("📝 Next step: Set up your .env file")


if __name__ == "__main__":
    main()

ui/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+"""UI components for PDF Text Extractor."""
+from ui.interface import create_interface
+from ui.handlers import copy_text, download_text, process_images_for_display
+from ui.components import (
+    create_header, create_upload_section, create_action_button,
+    create_text_display, create_action_buttons, create_image_gallery,
+    apply_custom_css
+)
+__all__ = [
+    "create_interface", "copy_text", "download_text", "process_images_for_display",
+    "create_header", "create_upload_section", "create_action_button",
+    "create_text_display", "create_action_buttons", "create_image_gallery",
+    "apply_custom_css"
+]

ui/components.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""
+UI components module for PDF Text Extractor.
+Contains functions for creating individual UI components.
+"""
+import gradio as gr
+from typing import Tuple, List, Dict, Any
def create_header() -> gr.Markdown:
    """
    Build the Markdown banner shown at the top of the page.

    Returns:
        gr.Markdown: Title, short description, usage steps and size note
    """
    header_markdown = """
    # 🔍 PDF Text Extractor
    Extract text and images from PDF files using Mistral AI's OCR technology.
    **Instructions:**
    1. Upload a PDF file using the file selector below
    2. Wait for processing to complete
    3. View the extracted text and images
    4. Use the Copy or Download buttons to save the extracted text
    **Supported:** PDF files up to 10MB
    """
    return gr.Markdown(header_markdown)
def create_upload_section() -> gr.File:
    """
    Build the single-file PDF upload widget.

    Returns:
        gr.File: Upload component restricted to ``.pdf`` files
    """
    upload_options = {
        "label": "Upload PDF File",
        "file_types": [".pdf"],
        "file_count": "single",
    }
    return gr.File(**upload_options)
def create_action_button() -> gr.Button:
    """
    Build the primary button that starts extraction.

    Returns:
        gr.Button: Button labelled "Extract Text & Images"
    """
    extract_button = gr.Button("Extract Text & Images", variant="primary")
    return extract_button
def create_text_display() -> Tuple[gr.Textbox, gr.Textbox]:
    """
    Build the extracted-text box and the status box.

    Returns:
        Tuple[gr.Textbox, gr.Textbox]: (text output, status output), created
        in that order
    """
    # The text box is created before the status box.
    extracted_box = gr.Textbox(
        label="Extracted Text",
        placeholder="Extracted text will appear here...",
        lines=10,
        max_lines=20,
        show_copy_button=True,
    )
    status_box = gr.Textbox(
        label="Status",
        placeholder="Upload a PDF to see status...",
        lines=2,
    )
    return extracted_box, status_box
def create_image_gallery() -> gr.Gallery:
    """
    Build the gallery that shows images extracted from the PDF.

    Returns:
        gr.Gallery: Visible 3-column, 2-row gallery with id "image_gallery"
    """
    gallery_options = {
        "label": "Extracted Images",
        "columns": 3,
        "rows": 2,
        "object_fit": "contain",
        "height": "auto",
        "visible": True,
        "show_label": True,
        "elem_id": "image_gallery",
    }
    return gr.Gallery(**gallery_options)
def create_action_buttons() -> Tuple[gr.Button, gr.Button]:
    """
    Build the copy and download buttons.

    Returns:
        Tuple[gr.Button, gr.Button]: (copy button, download button), created
        in that order
    """
    copy_label, download_label = "📋 Copy to Clipboard", "📥 Download as Text File"
    clipboard_button = gr.Button(copy_label)
    save_button = gr.Button(download_label)
    return clipboard_button, save_button
def apply_custom_css() -> gr.HTML:
    """
    Build an HTML component carrying the application's ``<style>`` rules.

    Returns:
        gr.HTML: Component whose content is a single ``<style>`` element
    """
    stylesheet = """
    <style>
    .gradio-container {
        max-width: 900px !important;
    }
    .output-markdown {
        font-family: 'Courier New', monospace;
    }
    .image-gallery-caption {
        text-align: center;
        font-size: 0.9em;
    }
    </style>
    """
    return gr.HTML(stylesheet)

ui/handlers.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+Event handlers for UI components.
+Contains functions that handle user interactions with the interface.
+"""
+import os
+import tempfile
+from typing import Optional, List, Dict, Any
+from utils.pdf_image_extractor import PDFImageExtractor
def copy_text(text: str) -> str:
    """
    Callback for the Copy button.

    The function is a pass-through: whatever it receives is handed back
    untouched.

    Args:
        text: Current content of the extracted-text box

    Returns:
        str: ``text`` itself
    """
    unchanged = text
    return unchanged
def download_text(text: str) -> Optional[str]:
    """
    Handle Download button click.

    Writes ``text`` to ``extracted_text.txt`` inside a freshly created
    temporary directory, so simultaneous downloads never overwrite each
    other while the file keeps a stable, friendly name.

    Args:
        text: Text to download

    Returns:
        Optional[str]: Path to the created text file or None if text is empty
    """
    if not text:
        return None

    # One private directory per call instead of a fixed path in the shared
    # temp dir, which every concurrent request would otherwise write to.
    export_dir = tempfile.mkdtemp(prefix="pdf_explainer_")
    file_path = os.path.join(export_dir, "extracted_text.txt")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)
    return file_path
def process_images_for_display(images_data: List[Dict[str, Any]], pdf_path: Optional[str] = None) -> List:
    """
    Process images for display in the Gradio gallery.

    Tries, in order: cropping images out of the PDF using the bounding boxes
    in ``images_data``; extracting every image from the PDF; and finally
    decoding the base64 payload carried by each entry of ``images_data``.

    Args:
        images_data: List of image data dictionaries from OCR response
        pdf_path: Path to the original PDF file for image extraction

    Returns:
        List: List of image paths for gallery display
    """
    import base64  # not imported at module level in this file

    if not images_data:
        return []

    # If we have PDF path and bounding box data, extract images from PDF
    if pdf_path and os.path.exists(pdf_path):
        print("🖼️ Extracting images from PDF using bounding box coordinates...")
        extracted_paths = PDFImageExtractor.extract_images_from_pdf(pdf_path, images_data)
        if extracted_paths:
            return extracted_paths
        # Fallback: extract all images from PDF if bounding box extraction failed
        print("🔄 Fallback: Extracting all images from PDF...")
        extracted_paths = PDFImageExtractor.extract_all_images_from_pdf(pdf_path)
        if extracted_paths:
            return extracted_paths[:len(images_data)]  # Limit to expected number of images

    # Fallback: use base64 data from OCR response
    print("🔄 Using base64 image data from OCR response...")
    gallery_images = []
    # A private directory per call: fixed names in the shared temp dir would
    # let concurrent requests overwrite each other's images.
    temp_dir = tempfile.mkdtemp(prefix="pdf_explainer_images_")
    for index, img_data in enumerate(images_data):
        try:
            base64_data = img_data.get('base64', '')
            if not base64_data:
                continue
            # The payload may arrive as a data URI ("data:image/...;base64,...").
            # Decoding the header along with the data would prepend garbage
            # bytes, so keep only what follows the first comma.
            if isinstance(base64_data, str) and base64_data.startswith("data:"):
                base64_data = base64_data.partition(",")[2]
            # Decode before opening the file so a bad payload leaves no
            # empty file behind.
            image_bytes = base64.b64decode(base64_data)
            img_path = os.path.join(temp_dir, f"extracted_image_fallback_{index}.jpg")
            with open(img_path, "wb") as img_file:
                img_file.write(image_bytes)
            # Add path to gallery list (Gradio Gallery expects a list of paths)
            gallery_images.append(img_path)
        except Exception as e:
            # Best effort: one bad image must not hide the others.
            print(f"Error processing image {index}: {str(e)}")
    return gallery_images

ui/interface.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+Interface creation module for PDF Text Extractor.
+Defines the Gradio interface components and layout.
+"""
+import gradio as gr
+from pdf_text_extractor import PDFTextExtractor
+from ui.handlers import copy_text, download_text, process_images_for_display
+from ui.components import (
+    create_header, create_upload_section, create_action_button,
+    create_text_display, create_action_buttons, create_image_gallery, apply_custom_css
+)
+def create_dummy_interface() -> gr.Blocks:
+    """
+    Build the placeholder UI shown when the Mistral API key is missing.
+    The layout mirrors the real application, but the extract, copy and
+    download buttons are non-interactive and the text/status boxes are
+    pre-filled with an explanation of what has to be configured.
+    Returns:
+        gr.Blocks: Gradio interface with disabled functionality
+    """
+    # Pre-filled messages shown in place of real output
+    unavailable_text = "API key not configured. Text extraction is unavailable."
+    missing_key_status = "❌ MISTRAL_API_KEY environment variable is not set. Please set it and restart the application."
+    with gr.Blocks(title="PDF Text Extractor") as placeholder_ui:
+        # Banner explaining why the application is disabled
+        gr.Markdown("""
+        # 🔍 PDF Text Extractor
+        ⚠️ **API key not configured.** Please set MISTRAL_API_KEY environment variable and restart the application.
+        """)
+        # Components are created in display order, one row each
+        with gr.Row():
+            gr.File(label="Upload PDF", file_types=[".pdf"])
+        with gr.Row():
+            gr.Button("Extract Text", variant="primary", interactive=False)
+        with gr.Row():
+            gr.Textbox(label="Extracted Text", lines=10, value=unavailable_text, interactive=False)
+        with gr.Row():
+            gr.Textbox(label="Status", lines=2, value=missing_key_status)
+        with gr.Row():
+            gr.Button("📋 Copy to Clipboard", interactive=False)
+            gr.Button("📥 Download as Text File", interactive=False)
+    return placeholder_ui
+def create_main_interface(extractor: PDFTextExtractor) -> gr.Blocks:
+    """
+    Build the fully functional extraction interface.
+    Args:
+        extractor: PDFTextExtractor instance used by the extract callback
+    Returns:
+        gr.Blocks: Gradio interface with full functionality
+    """
+    # Browser-side handler for the copy button: writes the text to the
+    # clipboard and shows a notification that is removed after two seconds.
+    clipboard_js = """
+            function(text) {
+                if (text) {
+                    navigator.clipboard.writeText(text);
+                    // Show a temporary notification
+                    var notification = document.createElement('div');
+                    notification.textContent = 'Text copied to clipboard!';
+                    notification.style.position = 'fixed';
+                    notification.style.bottom = '20px';
+                    notification.style.left = '50%';
+                    notification.style.transform = 'translateX(-50%)';
+                    notification.style.padding = '10px 20px';
+                    notification.style.background = '#4CAF50';
+                    notification.style.color = 'white';
+                    notification.style.borderRadius = '4px';
+                    notification.style.zIndex = '1000';
+                    document.body.appendChild(notification);
+                    setTimeout(function() {
+                        document.body.removeChild(notification);
+                    }, 2000);
+                }
+                return text;
+            }
+            """
+    def process_pdf_wrapper(pdf_file):
+        """Run the extraction; `extractor` is captured from the enclosing scope."""
+        extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)
+        # Resolve a filesystem path for image extraction: objects exposing
+        # `.name` contribute that attribute, any other truthy value is used
+        # as the path itself, and a falsy value means "no PDF available".
+        if hasattr(pdf_file, 'name'):
+            pdf_path = pdf_file.name
+        elif pdf_file:
+            pdf_path = pdf_file
+        else:
+            pdf_path = None
+        return extracted_text, status, process_images_for_display(images_data, pdf_path)
+    with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as interface:
+        # Components are created in display order: header, upload, button, status
+        create_header()
+        with gr.Row():
+            pdf_input = create_upload_section()
+        with gr.Row():
+            submit_btn = create_action_button()
+        with gr.Row():
+            status_output = gr.Textbox(label="Status", lines=2, placeholder="Upload a PDF to see status...")
+        # One tab for the extracted text, one for the extracted images
+        with gr.Tabs():
+            with gr.TabItem("Extracted Text"):
+                text_output = gr.Textbox(
+                    label="Extracted Text",
+                    lines=15,
+                    max_lines=30,
+                    placeholder="Extracted text will appear here...",
+                    show_copy_button=True,
+                )
+                with gr.Row():
+                    copy_btn, download_btn = create_action_buttons()
+            with gr.TabItem("Extracted Images"):
+                image_gallery = create_image_gallery()
+                gr.Markdown("Images extracted from the PDF will appear here.")
+        # Extraction fills the text box, the status box and the gallery
+        submit_btn.click(
+            fn=process_pdf_wrapper,
+            inputs=[pdf_input],
+            outputs=[text_output, status_output, image_gallery],
+        )
+        # Copy: the Python callback has no outputs; the clipboard work happens in JS
+        copy_btn.click(fn=copy_text, inputs=text_output, outputs=None, js=clipboard_js)
+        # Download: the result is delivered through a File component created here
+        download_target = gr.File(label="Download", elem_id="download_output")
+        download_btn.click(
+            fn=download_text,
+            inputs=text_output,
+            outputs=download_target,
+            show_progress=False,
+        )
+        # Apply custom CSS styling
+        apply_custom_css()
+    return interface
+def create_interface() -> gr.Blocks:
+    """
+    Create and configure the Gradio interface.
+    Builds the full interface when a PDFTextExtractor can be constructed and
+    falls back to the disabled placeholder interface when the constructor
+    raises ValueError.
+    Returns:
+        gr.Blocks: Configured Gradio interface
+    """
+    # Guard only the constructor: a ValueError raised while building the main
+    # interface is a genuine bug and must not be reported as a missing API key.
+    try:
+        extractor = PDFTextExtractor()
+    except ValueError:
+        # NOTE(review): ValueError is taken to mean "API key missing" — confirm
+        # against PDFTextExtractor.__init__.
+        return create_dummy_interface()
+    return create_main_interface(extractor)

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+"""Utility functions for PDF Text Extractor."""
+from utils.config import check_api_key, get_app_config
+__all__ = ["check_api_key", "get_app_config"]

utils/config.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""
+Configuration utilities for PDF Text Extractor.
+Contains functions for handling environment variables and app configuration.
+"""
+import os
+from typing import Dict, Any
+def check_api_key() -> bool:
+    """
+    Report whether MISTRAL_API_KEY is present and non-empty in the environment.
+    When it is not, a short setup hint is printed to stdout.
+    Returns:
+        bool: True if API key is set, False otherwise
+    """
+    if os.environ.get("MISTRAL_API_KEY"):
+        return True
+    # The trailing empty string reproduces the blank separator line
+    hint_lines = (
+        "⚠️  Warning: MISTRAL_API_KEY environment variable is not set.",
+        "   Please set it before using the PDF extraction functionality.",
+        "   Example: export MISTRAL_API_KEY='your-api-key-here'",
+        "",
+    )
+    for hint in hint_lines:
+        print(hint)
+    return False
+def get_app_config() -> Dict[str, Any]:
+    """
+    Return the launch settings for the application.
+    A new dict is built on every call.
+    Returns:
+        Dict[str, Any]: Application configuration settings
+    """
+    settings: Dict[str, Any] = dict(
+        server_port=7861,        # Use different port to avoid conflicts
+        debug=True,              # Enable debug mode for development
+        quiet=False,             # Show startup messages
+        max_file_size="10mb",    # Limit PDF file size
+    )
+    # To enable external access and public link sharing, additionally set:
+    #   settings["server_name"] = "0.0.0.0"  # Allow external access
+    #   settings["share"] = True             # Create public link
+    return settings

utils/pdf_image_extractor.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""
+PDF Image Extraction utilities.
+Extracts images from PDF using bounding box coordinates.
+"""
+import os
+import tempfile
+from typing import List, Dict, Any, Optional
+import fitz  # PyMuPDF
+from PIL import Image
+import base64
+import io
+class PDFImageExtractor:
+    """
+    Extract images from a PDF with PyMuPDF and save them as files in the
+    system temporary directory.
+    Two strategies are available: rendering the page regions described by
+    bounding boxes, and dumping every image object embedded in the document.
+    """
+    @staticmethod
+    def _write_base64_image(base64_data: str, img_path: str) -> None:
+        """Decode base64 image data and write it to img_path."""
+        # NOTE(review): OCR responses may deliver "data:<mime>;base64,<payload>"
+        # rather than a bare payload — confirm which form callers pass. Bare
+        # base64 never contains ':' so stripping the prefix is always safe.
+        if base64_data.startswith("data:") and "," in base64_data:
+            base64_data = base64_data.split(",", 1)[1]
+        # Decode before opening so invalid data does not leave an empty file
+        image_bytes = base64.b64decode(base64_data)
+        with open(img_path, "wb") as img_file:
+            img_file.write(image_bytes)
+    @staticmethod
+    def _render_region(page: Any, img_data: Dict[str, Any], img_path: str) -> None:
+        """Render the bounding box described by img_data from page into a PNG file."""
+        top_left_x = img_data.get('top_left_x', 0)
+        top_left_y = img_data.get('top_left_y', 0)
+        bottom_right_x = img_data.get('bottom_right_x', 0)
+        bottom_right_y = img_data.get('bottom_right_y', 0)
+        # Missing coordinates default to 0 and produce an empty box; reject it
+        # explicitly instead of relying on the renderer to fail.
+        if bottom_right_x <= top_left_x or bottom_right_y <= top_left_y:
+            raise ValueError("empty or inverted bounding box")
+        # PyMuPDF uses (x0, y0, x1, y1) format
+        # NOTE(review): the coordinates are used unchanged, i.e. interpreted in
+        # the page's own coordinate system — confirm they are not pixel
+        # coordinates of a rendered page image.
+        bbox = fitz.Rect(top_left_x, top_left_y, bottom_right_x, bottom_right_y)
+        # 2x zoom for better quality
+        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), clip=bbox)
+        with open(img_path, "wb") as img_file:
+            img_file.write(pix.tobytes("png"))
+    @staticmethod
+    def _fallback_to_base64(img_data: Any, index: int, temp_dir: str) -> Optional[str]:
+        """Save the entry's 'base64' payload, if any; return the path or None."""
+        try:
+            base64_data = img_data.get('base64', '')
+            if not base64_data:
+                return None
+            img_path = os.path.join(temp_dir, f"extracted_image_base64_{index}.jpg")
+            PDFImageExtractor._write_base64_image(base64_data, img_path)
+            print(f"✅ Used base64 data for image {index}: {img_path}")
+            return img_path
+        except Exception as e2:
+            # Best effort: a broken fallback must not abort the remaining images
+            print(f"Error using base64 data for image {index}: {str(e2)}")
+            return None
+    @staticmethod
+    def extract_images_from_pdf(pdf_path: str, images_data: List[Dict[str, Any]]) -> List[str]:
+        """
+        Extract images from PDF using bounding box coordinates.
+        Each entry is rendered from its page region at 2x zoom. When that
+        fails for any reason (invalid page index, empty bounding box, render
+        error) the entry's 'base64' payload is saved instead, if present.
+        Args:
+            pdf_path: Path to the PDF file
+            images_data: List of image data with bounding box coordinates; the
+                keys read are 'page', 'top_left_x', 'top_left_y',
+                'bottom_right_x', 'bottom_right_y' and 'base64'
+        Returns:
+            List[str]: List of paths to extracted image files; empty when
+            images_data is empty or the PDF cannot be opened
+        """
+        if not images_data:
+            return []
+        extracted_image_paths = []
+        temp_dir = tempfile.gettempdir()
+        try:
+            # The context manager closes the document on every exit path
+            with fitz.open(pdf_path) as pdf_doc:
+                page_count = len(pdf_doc)
+                for index, img_data in enumerate(images_data):
+                    try:
+                        page_num = img_data.get('page', 0)
+                        # Reject negative indices too: they would silently
+                        # select a page counted from the end of the document.
+                        if not 0 <= page_num < page_count:
+                            raise ValueError(f"Page {page_num} not found in PDF (max: {page_count - 1})")
+                        img_filename = f"extracted_image_page{page_num}_{index}.png"
+                        img_path = os.path.join(temp_dir, img_filename)
+                        PDFImageExtractor._render_region(pdf_doc[page_num], img_data, img_path)
+                        extracted_image_paths.append(img_path)
+                        print(f"✅ Extracted image {index} from page {page_num}: {img_path}")
+                    except Exception as e:
+                        print(f"Error extracting image {index}: {str(e)}")
+                        # Fallback: try to use base64 data if available
+                        fallback_path = PDFImageExtractor._fallback_to_base64(img_data, index, temp_dir)
+                        if fallback_path:
+                            extracted_image_paths.append(fallback_path)
+            return extracted_image_paths
+        except Exception as e:
+            print(f"Error opening PDF file: {str(e)}")
+            return []
+    @staticmethod
+    def extract_all_images_from_pdf(pdf_path: str) -> List[str]:
+        """
+        Extract all images from PDF without using bounding boxes.
+        This is a fallback method when no bounding box data is available.
+        Args:
+            pdf_path: Path to the PDF file
+        Returns:
+            List[str]: List of paths to extracted image files; empty when the
+            PDF cannot be opened or read
+        """
+        extracted_image_paths = []
+        temp_dir = tempfile.gettempdir()
+        try:
+            # The context manager closes the document even if listing a
+            # page's images raises
+            with fitz.open(pdf_path) as pdf_doc:
+                for page_num in range(len(pdf_doc)):
+                    page = pdf_doc[page_num]
+                    for img_index, img in enumerate(page.get_images()):
+                        try:
+                            # The first item of each image entry is its xref
+                            pix = fitz.Pixmap(pdf_doc, img[0])
+                            # GRAY and RGB pixmaps can be written as PNG directly;
+                            # anything with 4+ colour components (CMYK) is
+                            # converted to RGB first.
+                            if pix.n - pix.alpha >= 4:
+                                pix = fitz.Pixmap(fitz.csRGB, pix)
+                            img_bytes = pix.tobytes("png")
+                            pix = None  # drop the pixmap before handling the next image
+                            img_filename = f"all_images_page{page_num}_img{img_index}.png"
+                            img_path = os.path.join(temp_dir, img_filename)
+                            with open(img_path, "wb") as f:
+                                f.write(img_bytes)
+                            extracted_image_paths.append(img_path)
+                        except Exception as e:
+                            print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")
+            return extracted_image_paths
+        except Exception as e:
+            print(f"Error extracting all images from PDF: {str(e)}")
+            return []